rx: Better translation of char-matching patterns
Translate or-patterns that (even partially) match single characters into character alternatives which are more efficient in matching, sometimes algorithmically so. Example: (or "%" (in "a-z") space) was previously translated to "%\\|[a-z]\\|[[:space:]]" but now becomes "[%a-z[:space:]]" Single-char patterns include `nonl` and `anychar`, which now can also be used in set operations (union, complement and intersection), and character classes. For example, `(or nonl "\n")` is now equivalent to `anychar`. * lisp/emacs-lisp/rx.el (rx--expand-def): Remove, split into... (rx--expand-def-form, rx--expand-def-symbol): ...these. (rx--translate-compat-symbol-entry) (rx--translate-compat-form-entry): New functions for handling the legacy extension mechanism. (rx--normalise-or-arg): Renamed to... (rx--normalise-char-pattern): ...this, and rewrite. (rx--all-string-or-args): Remove, split into... (rx--all-string-branches-p, rx--collect-or-strings): ...these. (rx--char-alt-union, rx--intersection-intervals) (rx--reduce-to-char-alt, rx--optimise-or-args) (rx--translate-char-alt, rx--human-readable): New. (rx--translate-or, rx--translate-not, rx--translate-intersection): Rewrite. (rx--charset-p, rx--intervals-to-alt, rx--charset-intervals) (rx--charset-union, rx--charset-intersection, rx--charset-all) (rx--translate-union): Remove. (rx--generate-alt): Decide whether to generate a negated character alternative. (rx--complement-intervals, rx--intersect-intervals) (rx--union-intervals): Rename to... (rx--interval-set-complement, rx--interval-set-intersection) (rx--interval-set-union): ...these. (rx--translate-symbol, rx--translate-form): Refactor extension processing. Handle synthetic `rx--char-alt` form. * test/lisp/emacs-lisp/rx-tests.el (rx-or, rx-char-any-raw-byte) (rx-any, rx-charset-or): Adapt to changes and extend. * test/lisp/emacs-lisp/rx-tests.el (rx--complement-intervals) (rx--union-intervals, rx--intersect-intervals): Rename to... (rx--interval-set-complement, rx--interval-set-union) (rx--interval-set-intersection): ...these.
This commit is contained in:
parent
7b1eb9d753
commit
de6c1c4d5c
2 changed files with 416 additions and 238 deletions
|
@ -161,27 +161,23 @@ Each entry is:
|
|||
(or (cdr (assq name rx--local-definitions))
|
||||
(get name 'rx-definition)))
|
||||
|
||||
(defun rx--expand-def (form)
|
||||
"FORM expanded (once) if a user-defined construct; otherwise nil."
|
||||
(cond ((symbolp form)
|
||||
(let ((def (rx--lookup-def form)))
|
||||
(defun rx--expand-def-form (form)
|
||||
"List FORM expanded (once) if a user-defined construct; otherwise nil."
|
||||
(let ((op (car form)))
|
||||
(and (symbolp op)
|
||||
(let ((def (rx--lookup-def op)))
|
||||
(and def
|
||||
(if (cdr def)
|
||||
(error "Not an `rx' symbol definition: %s" form)
|
||||
(car def)))))
|
||||
((and (consp form) (symbolp (car form)))
|
||||
(let* ((op (car form))
|
||||
(def (rx--lookup-def op)))
|
||||
(and def
|
||||
(if (cdr def)
|
||||
(rx--expand-template
|
||||
op (cdr form) (nth 0 def) (nth 1 def))
|
||||
(rx--expand-template op (cdr form) (nth 0 def) (nth 1 def))
|
||||
(error "Not an `rx' form definition: %s" op)))))))
|
||||
|
||||
;; TODO: Additions to consider:
|
||||
;; - A construct like `or' but without the match order guarantee,
|
||||
;; maybe `unordered-or'. Useful for composition or generation of
|
||||
;; alternatives; permits more effective use of regexp-opt.
|
||||
(defun rx--expand-def-symbol (symbol)
|
||||
"SYM expanded (once) if a user-defined name; otherwise nil."
|
||||
(let ((def (rx--lookup-def symbol)))
|
||||
(and def
|
||||
(if (cdr def)
|
||||
(error "Not an `rx' symbol definition: %s" symbol)
|
||||
(car def)))))
|
||||
|
||||
(defun rx--translate-symbol (sym)
|
||||
"Translate an rx symbol. Return (REGEXP . PRECEDENCE)."
|
||||
|
@ -208,22 +204,13 @@ Each entry is:
|
|||
((let ((class (cdr (assq sym rx--char-classes))))
|
||||
(and class (cons (list (concat "[[:" (symbol-name class) ":]]")) t))))
|
||||
|
||||
((let ((expanded (rx--expand-def sym)))
|
||||
((let ((expanded (rx--expand-def-symbol sym)))
|
||||
(and expanded (rx--translate expanded))))
|
||||
|
||||
;; For compatibility with old rx.
|
||||
((let ((entry (assq sym rx-constituents)))
|
||||
(and (progn
|
||||
(while (and entry (not (stringp (cdr entry))))
|
||||
(setq entry
|
||||
(if (symbolp (cdr entry))
|
||||
;; Alias for another entry.
|
||||
(assq (cdr entry) rx-constituents)
|
||||
;; Wrong type, try further down the list.
|
||||
(assq (car entry)
|
||||
(cdr (memq entry rx-constituents))))))
|
||||
entry)
|
||||
(cons (list (cdr entry)) nil))))
|
||||
(and entry (rx--translate-compat-symbol-entry entry))))
|
||||
|
||||
(t (error "Unknown rx symbol `%s'" sym))))))
|
||||
|
||||
(defun rx--enclose (left-str rexp right-str)
|
||||
|
@ -289,83 +276,225 @@ Left-fold the list L, starting with X, by the binary function F."
|
|||
(setq l (cdr l)))
|
||||
x)
|
||||
|
||||
(defun rx--normalise-or-arg (form)
|
||||
"Normalize the `or' argument FORM.
|
||||
Characters become strings, user-definitions and `eval' forms are expanded,
|
||||
and `or' forms are normalized recursively."
|
||||
(cond ((characterp form)
|
||||
(char-to-string form))
|
||||
((and (consp form) (memq (car form) '(or |)))
|
||||
(cons (car form) (mapcar #'rx--normalise-or-arg (cdr form))))
|
||||
((and (consp form) (eq (car form) 'eval))
|
||||
(rx--normalise-or-arg (rx--expand-eval (cdr form))))
|
||||
(t
|
||||
(let ((expanded (rx--expand-def form)))
|
||||
(if expanded
|
||||
(rx--normalise-or-arg expanded)
|
||||
form)))))
|
||||
;; FIXME: flatten nested `or' patterns when performing char-pattern combining.
|
||||
;; The only reason for not flattening is to ensure regexp-opt processing
|
||||
;; (which we do for entire `or' patterns, not subsequences), but we
|
||||
;; obviously want to translate
|
||||
;; (or "a" space (or "b" (+ nonl) word) "c")
|
||||
;; -> (or (in "ab" space) (+ nonl) (in "c" word))
|
||||
|
||||
(defun rx--all-string-or-args (body)
|
||||
"If BODY only consists of strings or such `or' forms, return all the strings.
|
||||
Otherwise throw `rx--nonstring'."
|
||||
;; FIXME: normalise `seq', both the construct and implicit sequences,
|
||||
;; so that they are flattened, adjacent strings concatenated, and
|
||||
;; empty strings removed. That would give more opportunities for regexp-opt:
|
||||
;; (or "a" (seq "ab" (seq "c" "d") "")) -> (or "a" "abcd")
|
||||
|
||||
;; FIXME: Since `rx--normalise-char-pattern' recurses through `or', `not' and
|
||||
;; `intersection', we may end up normalising subtrees multiple times
|
||||
;; which wastes time (but should be idempotent).
|
||||
;; One way to avoid this is to aggressively normalise the entire tree
|
||||
;; before translating anything at all, but we must then recurse through
|
||||
;; all constructs and probably copy them.
|
||||
;; Such normalisation could normalise synonyms, eliminate `minimal-match'
|
||||
;; and `maximal-match' and convert affected `1+' to either `+' or `+?' etc.
|
||||
;; We would also consolidate the user-def lookup, both modern and legacy,
|
||||
;; in one place.
|
||||
|
||||
(defun rx--normalise-char-pattern (form)
|
||||
"Normalize FORM as a pattern matching a single-character.
|
||||
Characters become strings, `any' forms and character classes become
|
||||
`rx--char-alt' forms, user-definitions and `eval' forms are expanded,
|
||||
and `or', `not' and `intersection' forms are normalized recursively.
|
||||
|
||||
A `rx--char-alt' form is shaped (rx--char-alt INTERVALS . CLASSES)
|
||||
where INTERVALS is a sorted list of disjoint nonadjacent intervals,
|
||||
each a cons of characters, and CLASSES an unordered list of unique
|
||||
name-normalised character classes."
|
||||
(defvar rx--builtin-forms)
|
||||
(defvar rx--builtin-symbols)
|
||||
(cond ((consp form)
|
||||
(let ((op (car form))
|
||||
(body (cdr form)))
|
||||
(cond ((memq op '(or |))
|
||||
;; Normalise the constructor to `or' and the args recursively.
|
||||
(cons 'or (mapcar #'rx--normalise-char-pattern body)))
|
||||
;; Convert `any' forms and char classes now so that we
|
||||
;; don't need to do it later on.
|
||||
((memq op '(any in char))
|
||||
(cons 'rx--char-alt (rx--parse-any body)))
|
||||
((memq op '(not intersection))
|
||||
(cons op (mapcar #'rx--normalise-char-pattern body)))
|
||||
((eq op 'eval)
|
||||
(rx--normalise-char-pattern (rx--expand-eval body)))
|
||||
((memq op rx--builtin-forms) form)
|
||||
((let ((expanded (rx--expand-def-form form)))
|
||||
(and expanded
|
||||
(rx--normalise-char-pattern expanded))))
|
||||
(t form))))
|
||||
;; FIXME: Should we expand legacy definitions from
|
||||
;; `rx-constituents' here as well?
|
||||
((symbolp form)
|
||||
(cond ((let ((class (assq form rx--char-classes)))
|
||||
(and class
|
||||
`(rx--char-alt nil . (,(cdr class))))))
|
||||
((memq form rx--builtin-symbols) form)
|
||||
((let ((expanded (rx--expand-def-symbol form)))
|
||||
(and expanded
|
||||
(rx--normalise-char-pattern expanded))))
|
||||
(t form)))
|
||||
((characterp form)
|
||||
(char-to-string form))
|
||||
(t form)))
|
||||
|
||||
(defun rx--char-alt-union (a b)
|
||||
"Union of the (INTERVALS . CLASSES) pairs A and B."
|
||||
(let* ((a-cl (cdr a))
|
||||
(b-cl (cdr b))
|
||||
(classes (if (and a-cl b-cl)
|
||||
(let ((acc a-cl))
|
||||
(dolist (c b-cl)
|
||||
(unless (memq c a-cl)
|
||||
(push c acc)))
|
||||
acc)
|
||||
(or a-cl b-cl))))
|
||||
(cons (rx--interval-set-union (car a) (car b)) classes)))
|
||||
|
||||
(defun rx--intersection-intervals (forms)
|
||||
"Intersection of the normalised FORMS, as an interval set."
|
||||
(rx--foldl #'rx--interval-set-intersection '((0 . #x3fffff))
|
||||
(mapcar (lambda (x)
|
||||
(let ((char (rx--reduce-to-char-alt x)))
|
||||
(if (and char (null (cdr char)))
|
||||
(car char)
|
||||
(error "Cannot be used in rx intersection: %S"
|
||||
(rx--human-readable x)))))
|
||||
forms)))
|
||||
|
||||
(defun rx--reduce-to-char-alt (form)
|
||||
"Transform FORM into (INTERVALS . CLASSES) or nil if not possible.
|
||||
Process `or', `intersection' and `not'.
|
||||
FORM must be normalised (from `rx--normalise-char-pattern')."
|
||||
(cond
|
||||
((stringp form)
|
||||
(and (= (length form) 1)
|
||||
(let ((c (aref form 0)))
|
||||
(list (list (cons c c))))))
|
||||
((consp form)
|
||||
(let ((head (car form)))
|
||||
(cond
|
||||
;; FIXME: Transform `digit', `xdigit', `cntrl', `ascii', `nonascii'
|
||||
;; to ranges? That would allow them to be negated and intersected.
|
||||
((eq head 'rx--char-alt) (cdr form))
|
||||
((eq head 'not)
|
||||
(unless (= (length form) 2)
|
||||
(error "rx `not' form takes exactly one argument"))
|
||||
(let ((arg (rx--reduce-to-char-alt (cadr form))))
|
||||
;; Only interval sets without classes are closed under complement.
|
||||
(and arg (null (cdr arg))
|
||||
(list (rx--interval-set-complement (car arg))))))
|
||||
((eq head 'or)
|
||||
(let ((args (cdr form)))
|
||||
(let ((acc '(nil))) ; union identity
|
||||
(while (and args
|
||||
(let ((char (rx--reduce-to-char-alt (car args))))
|
||||
(setq acc (and char (rx--char-alt-union acc char)))))
|
||||
(setq args (cdr args)))
|
||||
acc)))
|
||||
((eq head 'intersection)
|
||||
(list (rx--intersection-intervals (cdr form))))
|
||||
)))
|
||||
((memq form '(nonl not-newline any))
|
||||
'(((0 . 9) (11 . #x3fffff))))
|
||||
((memq form '(anychar anything))
|
||||
'(((0 . #x3fffff))))
|
||||
;; FIXME: A better handling of `unmatchable' would be:
|
||||
;; * (seq ... unmatchable ...) -> unmatchable
|
||||
;; * any or-pattern branch that is `unmatchable' is deleted
|
||||
;; * (REPEAT unmatchable) -> "", if REPEAT accepts 0 repetitions
|
||||
;; * (REPEAT unmatchable) -> unmatchable, otherwise
|
||||
;; if it's worth the trouble (probably not).
|
||||
((eq form 'unmatchable)
|
||||
'(nil))
|
||||
))
|
||||
|
||||
(defun rx--optimise-or-args (args)
|
||||
"Optimise `or' arguments. Return a new rx form.
|
||||
Each element of ARGS should have been normalised using
|
||||
`rx--normalise-char-pattern'."
|
||||
(if (null args)
|
||||
;; No arguments.
|
||||
'(rx--char-alt nil . nil) ; FIXME: not `unmatchable'?
|
||||
;; Join consecutive single-char branches into a char alt where possible.
|
||||
;; Ideally we should collect all single-char branches but that might
|
||||
;; alter matching order in some cases.
|
||||
(let ((branches nil)
|
||||
(prev-char nil))
|
||||
(while args
|
||||
(let* ((item (car args))
|
||||
(item-char (rx--reduce-to-char-alt item)))
|
||||
(if item-char
|
||||
(setq prev-char (if prev-char
|
||||
(rx--char-alt-union prev-char item-char)
|
||||
item-char))
|
||||
(when prev-char
|
||||
(push (cons 'rx--char-alt prev-char) branches)
|
||||
(setq prev-char nil))
|
||||
(push item branches)))
|
||||
(setq args (cdr args)))
|
||||
(when prev-char
|
||||
(push (cons 'rx--char-alt prev-char) branches))
|
||||
(if (cdr branches)
|
||||
(cons 'or (nreverse branches))
|
||||
(car branches)))))
|
||||
|
||||
(defun rx--all-string-branches-p (forms)
|
||||
"Whether FORMS are all strings or `or' forms with the same property."
|
||||
(rx--every (lambda (x) (or (stringp x)
|
||||
(and (eq (car-safe x) 'or)
|
||||
(rx--all-string-branches-p (cdr x)))))
|
||||
forms))
|
||||
|
||||
(defun rx--collect-or-strings (forms)
|
||||
"All strings from FORMS, which are strings or `or' forms."
|
||||
(mapcan (lambda (form)
|
||||
(cond ((stringp form) (list form))
|
||||
((and (consp form) (memq (car form) '(or |)))
|
||||
(rx--all-string-or-args (cdr form)))
|
||||
(t (throw 'rx--nonstring nil))))
|
||||
body))
|
||||
(if (stringp form)
|
||||
(list form)
|
||||
;; must be an `or' form
|
||||
(rx--collect-or-strings (cdr form))))
|
||||
forms))
|
||||
|
||||
;; TODO: Write a more general rx-level factoriser to replace
|
||||
;; `regexp-opt' for our purposes. It would handle non-literals:
|
||||
;;
|
||||
;; (or "ab" (: "a" space) "bc" (: "b" (+ digit)))
|
||||
;; -> (or (: "a" (in "b" space)) (: "b" (or "c" (+ digit))))
|
||||
;;
|
||||
;; As a minor side benefit we would get less useless bracketing.
|
||||
;; The main problem is how to deal with matching order, which `regexp-opt'
|
||||
;; alters in its own way.
|
||||
|
||||
(defun rx--translate-or (body)
|
||||
"Translate an or-pattern of zero or more rx items.
|
||||
Return (REGEXP . PRECEDENCE)."
|
||||
;; FIXME: Possible improvements:
|
||||
;;
|
||||
;; - Flatten sub-patterns first: (or (or A B) (or C D)) -> (or A B C D)
|
||||
;; Then call regexp-opt on runs of string arguments. Example:
|
||||
;; (or (+ digit) "CHARLIE" "CHAN" (+ blank))
|
||||
;; -> (or (+ digit) (or "CHARLIE" "CHAN") (+ blank))
|
||||
;;
|
||||
;; - Optimize single-character alternatives better:
|
||||
;; * classes: space, alpha, ...
|
||||
;; * (syntax S), for some S (whitespace, word)
|
||||
;; so that (or "@" "%" digit (any "A-Z" space) (syntax word))
|
||||
;; -> (any "@" "%" digit "A-Z" space word)
|
||||
;; -> "[A-Z@%[:digit:][:space:][:word:]]"
|
||||
(cond
|
||||
((null body) ; No items: a never-matching regexp.
|
||||
(rx--empty))
|
||||
((null (cdr body)) ; Single item.
|
||||
(rx--translate (car body)))
|
||||
(t
|
||||
(let* ((args (mapcar #'rx--normalise-or-arg body))
|
||||
(all-strings (catch 'rx--nonstring (rx--all-string-or-args args))))
|
||||
(cond
|
||||
(all-strings ; Only strings.
|
||||
(cons (list (regexp-opt all-strings nil))
|
||||
t))
|
||||
((rx--every #'rx--charset-p args) ; All charsets.
|
||||
(rx--translate-union nil args))
|
||||
(t
|
||||
(cons (append (car (rx--translate (car args)))
|
||||
(mapcan (lambda (item)
|
||||
(cons "\\|" (car (rx--translate item))))
|
||||
(cdr args)))
|
||||
nil)))))))
|
||||
|
||||
(defun rx--charset-p (form)
|
||||
"Whether FORM looks like a charset, only consisting of character intervals
|
||||
and set operations."
|
||||
(or (and (consp form)
|
||||
(or (and (memq (car form) '(any in char))
|
||||
(rx--every (lambda (x) (not (symbolp x))) (cdr form)))
|
||||
(and (memq (car form) '(not or | intersection))
|
||||
(rx--every #'rx--charset-p (cdr form)))))
|
||||
(characterp form)
|
||||
(and (stringp form) (= (length form) 1))
|
||||
(and (or (symbolp form) (consp form))
|
||||
(let ((expanded (rx--expand-def form)))
|
||||
(and expanded
|
||||
(rx--charset-p expanded))))))
|
||||
(let ((args (mapcar #'rx--normalise-char-pattern body)))
|
||||
(if (rx--all-string-branches-p args)
|
||||
;; All branches are strings: use `regexp-opt'.
|
||||
(cons (list (regexp-opt (rx--collect-or-strings args) nil))
|
||||
t)
|
||||
(let ((form (rx--optimise-or-args args)))
|
||||
(if (eq (car-safe form) 'or)
|
||||
(let ((branches (cdr form)))
|
||||
(cons (append (car (rx--translate (car branches)))
|
||||
(mapcan (lambda (item)
|
||||
(cons "\\|" (car (rx--translate item))))
|
||||
(cdr branches)))
|
||||
nil))
|
||||
(rx--translate form))))))))
|
||||
|
||||
(defun rx--string-to-intervals (str)
|
||||
"Decode STR as intervals: A-Z becomes (?A . ?Z), and the single
|
||||
|
@ -420,7 +549,7 @@ INTERVALS is a list of (START . END) with START ≤ END, sorted by START."
|
|||
(defun rx--parse-any (body)
|
||||
"Parse arguments of an (any ...) construct.
|
||||
Return (INTERVALS . CLASSES), where INTERVALS is a sorted list of
|
||||
disjoint intervals (each a cons of chars), and CLASSES
|
||||
disjoint nonadjacent intervals (each a cons of chars), and CLASSES
|
||||
a list of named character classes in the order they occur in BODY."
|
||||
(let ((classes nil)
|
||||
(strings nil)
|
||||
|
@ -447,7 +576,7 @@ a list of named character classes in the order they occur in BODY."
|
|||
(sort (append conses
|
||||
(mapcan #'rx--string-to-intervals strings))
|
||||
#'car-less-than-car))
|
||||
(reverse classes))))
|
||||
(nreverse classes))))
|
||||
|
||||
(defun rx--generate-alt (negated intervals classes)
|
||||
"Generate a character alternative. Return (REGEXP . PRECEDENCE).
|
||||
|
@ -456,6 +585,19 @@ list of disjoint intervals and CLASSES a list of named character
|
|||
classes."
|
||||
;; No, this is not pretty code. You try doing it in a way that is both
|
||||
;; elegant and efficient. Or just one of the two. I dare you.
|
||||
|
||||
;; Detect whether the interval set is better described in
|
||||
;; complemented form. This is not just a matter of aesthetics: any
|
||||
;; range that straddles the char-raw boundary will be mutilated by the
|
||||
;; regexp engine. Ranges from ASCII to raw bytes will exclude the
|
||||
;; all non-ASCII non-raw bytes, and ranges from non-ASCII Unicode
|
||||
;; to raw bytes are ignored.
|
||||
(unless (or classes
|
||||
;; Any interval set covering #x3fff7f should be negated.
|
||||
(rx--every (lambda (iv) (not (<= (car iv) #x3fff7f (cdr iv))))
|
||||
intervals))
|
||||
(setq negated (not negated))
|
||||
(setq intervals (rx--interval-set-complement intervals)))
|
||||
(cond
|
||||
;; Single character.
|
||||
((and intervals (eq (caar intervals) (cdar intervals))
|
||||
|
@ -547,28 +689,18 @@ classes."
|
|||
"]"))
|
||||
t)))))
|
||||
|
||||
(defun rx--translate-char-alt (negated body)
|
||||
"Translate a (rx--char-alt ...) construct. Return (REGEXP . PRECEDENCE).
|
||||
If NEGATED, negate the sense."
|
||||
(rx--generate-alt negated (car body) (cdr body)))
|
||||
|
||||
(defun rx--translate-any (negated body)
|
||||
"Translate an (any ...) construct. Return (REGEXP . PRECEDENCE).
|
||||
If NEGATED, negate the sense."
|
||||
(let ((parsed (rx--parse-any body)))
|
||||
(rx--generate-alt negated (car parsed) (cdr parsed))))
|
||||
|
||||
(defun rx--intervals-to-alt (negated intervals)
|
||||
"Generate a character alternative from an interval set.
|
||||
Return (REGEXP . PRECEDENCE).
|
||||
INTERVALS is a sorted list of disjoint intervals.
|
||||
If NEGATED, negate the sense."
|
||||
;; Detect whether the interval set is better described in
|
||||
;; complemented form. This is not just a matter of aesthetics: any
|
||||
;; range from ASCII to raw bytes will automatically exclude the
|
||||
;; entire non-ASCII Unicode range by the regexp engine.
|
||||
(if (rx--every (lambda (iv) (not (<= (car iv) #x3ffeff (cdr iv))))
|
||||
intervals)
|
||||
(rx--generate-alt negated intervals nil)
|
||||
(rx--generate-alt
|
||||
(not negated) (rx--complement-intervals intervals) nil)))
|
||||
|
||||
;; FIXME: Consider turning `not' into a variadic operator, following SRE:
|
||||
;; TODO: Consider turning `not' into a variadic operator, following SRE:
|
||||
;; (not A B) = (not (or A B)) = (intersection (not A) (not B)), and
|
||||
;; (not) = anychar.
|
||||
;; Maybe allow singleton characters as arguments.
|
||||
|
@ -578,43 +710,27 @@ If NEGATED, negate the sense."
|
|||
If NEGATED, negate the sense (thus making it positive)."
|
||||
(unless (and body (null (cdr body)))
|
||||
(error "rx `not' form takes exactly one argument"))
|
||||
(let ((arg (car body)))
|
||||
(cond
|
||||
((and (consp arg)
|
||||
(pcase (car arg)
|
||||
((or 'any 'in 'char)
|
||||
(rx--translate-any (not negated) (cdr arg)))
|
||||
('syntax
|
||||
(rx--translate-syntax (not negated) (cdr arg)))
|
||||
('category
|
||||
(rx--translate-category (not negated) (cdr arg)))
|
||||
('not
|
||||
(rx--translate-not (not negated) (cdr arg)))
|
||||
((or 'or '|)
|
||||
(rx--translate-union (not negated) (cdr arg)))
|
||||
('intersection
|
||||
(rx--translate-intersection (not negated) (cdr arg))))))
|
||||
((let ((class (cdr (assq arg rx--char-classes))))
|
||||
(and class
|
||||
(rx--generate-alt (not negated) nil (list class)))))
|
||||
((eq arg 'word-boundary)
|
||||
(rx--translate-symbol
|
||||
(if negated 'word-boundary 'not-word-boundary)))
|
||||
((characterp arg)
|
||||
(rx--generate-alt (not negated) (list (cons arg arg)) nil))
|
||||
((and (stringp arg) (= (length arg) 1))
|
||||
(let ((char (string-to-char arg)))
|
||||
(rx--generate-alt (not negated) (list (cons char char)) nil)))
|
||||
((let ((expanded (rx--expand-def arg)))
|
||||
(and expanded
|
||||
(rx--translate-not negated (list expanded)))))
|
||||
(t (error "Illegal argument to rx `not': %S" arg)))))
|
||||
(let ((arg (rx--normalise-char-pattern (car body))))
|
||||
(pcase arg
|
||||
(`(not . ,args)
|
||||
(rx--translate-not (not negated) args))
|
||||
(`(syntax . ,args)
|
||||
(rx--translate-syntax (not negated) args))
|
||||
(`(category . ,args)
|
||||
(rx--translate-category (not negated) args))
|
||||
('word-boundary ; legacy syntax
|
||||
(rx--translate-symbol (if negated 'word-boundary 'not-word-boundary)))
|
||||
(_ (let ((char (rx--reduce-to-char-alt arg)))
|
||||
(if char
|
||||
(rx--generate-alt (not negated) (car char) (cdr char))
|
||||
(error "Illegal argument to rx `not': %S"
|
||||
(rx--human-readable arg))))))))
|
||||
|
||||
(defun rx--complement-intervals (intervals)
|
||||
"Complement of the interval list INTERVALS."
|
||||
(defun rx--interval-set-complement (ivs)
|
||||
"Complement of the interval set IVS."
|
||||
(let ((compl nil)
|
||||
(c 0))
|
||||
(dolist (iv intervals)
|
||||
(dolist (iv ivs)
|
||||
(when (< c (car iv))
|
||||
(push (cons c (1- (car iv))) compl))
|
||||
(setq c (1+ (cdr iv))))
|
||||
|
@ -622,8 +738,8 @@ If NEGATED, negate the sense (thus making it positive)."
|
|||
(push (cons c (max-char)) compl))
|
||||
(nreverse compl)))
|
||||
|
||||
(defun rx--intersect-intervals (ivs-a ivs-b)
|
||||
"Intersection of the interval lists IVS-A and IVS-B."
|
||||
(defun rx--interval-set-intersection (ivs-a ivs-b)
|
||||
"Intersection of the interval sets IVS-A and IVS-B."
|
||||
(let ((isect nil))
|
||||
(while (and ivs-a ivs-b)
|
||||
(let ((a (car ivs-a))
|
||||
|
@ -645,8 +761,8 @@ If NEGATED, negate the sense (thus making it positive)."
|
|||
ivs-a)))))))
|
||||
(nreverse isect)))
|
||||
|
||||
(defun rx--union-intervals (ivs-a ivs-b)
|
||||
"Union of the interval lists IVS-A and IVS-B."
|
||||
(defun rx--interval-set-union (ivs-a ivs-b)
|
||||
"Union of the interval sets IVS-A and IVS-B."
|
||||
(let ((union nil))
|
||||
(while (and ivs-a ivs-b)
|
||||
(let ((a (car ivs-a))
|
||||
|
@ -670,53 +786,66 @@ If NEGATED, negate the sense (thus making it positive)."
|
|||
ivs-a))))))
|
||||
(nconc (nreverse union) (or ivs-a ivs-b))))
|
||||
|
||||
(defun rx--charset-intervals (charset)
|
||||
"Return a sorted list of non-adjacent disjoint intervals from CHARSET.
|
||||
CHARSET is any expression allowed in a character set expression:
|
||||
characters, single-char strings, `any' forms (no classes permitted),
|
||||
or `not', `or' or `intersection' forms whose arguments are charsets."
|
||||
(pcase charset
|
||||
(`(,(or 'any 'in 'char) . ,body)
|
||||
(let ((parsed (rx--parse-any body)))
|
||||
(when (cdr parsed)
|
||||
(error
|
||||
"Character class not permitted in set operations: %S"
|
||||
(cadr parsed)))
|
||||
(car parsed)))
|
||||
(`(not ,x) (rx--complement-intervals (rx--charset-intervals x)))
|
||||
(`(,(or 'or '|) . ,body) (rx--charset-union body))
|
||||
(`(intersection . ,body) (rx--charset-intersection body))
|
||||
((pred characterp)
|
||||
(list (cons charset charset)))
|
||||
((guard (and (stringp charset) (= (length charset) 1)))
|
||||
(let ((char (string-to-char charset)))
|
||||
(list (cons char char))))
|
||||
(_ (let ((expanded (rx--expand-def charset)))
|
||||
(if expanded
|
||||
(rx--charset-intervals expanded)
|
||||
(error "Bad character set: %S" charset))))))
|
||||
|
||||
(defun rx--charset-union (charsets)
|
||||
"Union of CHARSETS, as a set of intervals."
|
||||
(rx--foldl #'rx--union-intervals nil
|
||||
(mapcar #'rx--charset-intervals charsets)))
|
||||
|
||||
(defconst rx--charset-all (list (cons 0 (max-char))))
|
||||
|
||||
(defun rx--charset-intersection (charsets)
|
||||
"Intersection of CHARSETS, as a set of intervals."
|
||||
(rx--foldl #'rx--intersect-intervals rx--charset-all
|
||||
(mapcar #'rx--charset-intervals charsets)))
|
||||
|
||||
(defun rx--translate-union (negated body)
|
||||
"Translate an (or ...) construct of charsets. Return (REGEXP . PRECEDENCE).
|
||||
If NEGATED, negate the sense."
|
||||
(rx--intervals-to-alt negated (rx--charset-union body)))
|
||||
(defun rx--human-readable (form)
|
||||
"Turn FORM into something that is more human-readable, for error messages."
|
||||
;; FIXME: Should we produce a string instead?
|
||||
;; That way we wouldn't have problems with ? and ??, and we could escape
|
||||
;; single chars.
|
||||
;; We could steal `xr--rx-to-string' and just file off the serials.
|
||||
(let ((recurse (lambda (op skip)
|
||||
(cons op (append (take skip (cdr form))
|
||||
(mapcar #'rx--human-readable
|
||||
(nthcdr skip (cdr form))))))))
|
||||
(pcase form
|
||||
;; strings are more readable than numbers for single chars
|
||||
((pred characterp) (char-to-string form))
|
||||
;; resugar `rx--char-alt'
|
||||
(`(rx--char-alt ((,c . ,c)) . nil)
|
||||
(char-to-string form))
|
||||
(`(rx--char-alt nil . (,class))
|
||||
class)
|
||||
;; TODO: render in complemented form if more readable that way?
|
||||
(`(rx--char-alt ,ivs . ,classes)
|
||||
(let ((strings (mapcan (lambda (iv)
|
||||
(let ((beg (car iv))
|
||||
(end (cdr iv)))
|
||||
(cond
|
||||
;; single char
|
||||
((eq beg end)
|
||||
(list (string beg)))
|
||||
;; two chars
|
||||
((eq end (1+ beg))
|
||||
(list (string beg) (string end)))
|
||||
;; first char is hyphen
|
||||
((eq beg ?-)
|
||||
(cons (string "-")
|
||||
(if (eq end (+ ?- 2))
|
||||
(list (string (1+ ?-) end))
|
||||
(list (string (1+ ?-) ?- end)))))
|
||||
;; other range
|
||||
(t (list (string beg ?- end))))))
|
||||
ivs)))
|
||||
`(any ,@strings ,@classes)))
|
||||
;; avoid numbers as ops
|
||||
(`(? . ,_) (funcall recurse '\? 0))
|
||||
(`(?? . ,_) (funcall recurse '\?? 0))
|
||||
;; recurse on arguments
|
||||
(`(repeat ,_ ,_) (funcall recurse (car form) 1))
|
||||
(`(,(or '** 'repeat) . ,_) (funcall recurse (car form) 2))
|
||||
(`(,(or '= '>= 'group-n 'submatch-n) . ,_) (funcall recurse (car form) 1))
|
||||
(`(,(or 'backref 'syntax 'not-syntax 'category
|
||||
'eval 'regex 'regexp 'literal)
|
||||
. ,_)
|
||||
form)
|
||||
(`(,_ . ,_) (funcall recurse (car form) 0))
|
||||
(_ form))))
|
||||
|
||||
(defun rx--translate-intersection (negated body)
|
||||
"Translate an (intersection ...) construct. Return (REGEXP . PRECEDENCE).
|
||||
If NEGATED, negate the sense."
|
||||
(rx--intervals-to-alt negated (rx--charset-intersection body)))
|
||||
(rx--generate-alt negated (rx--intersection-intervals
|
||||
(mapcar #'rx--normalise-char-pattern body))
|
||||
nil))
|
||||
|
||||
(defun rx--atomic-regexp (item)
|
||||
"ITEM is (REGEXP . PRECEDENCE); return a regexp of precedence t."
|
||||
|
@ -1006,6 +1135,36 @@ DEF is the definition tuple. Return (REGEXP . PRECEDENCE)."
|
|||
(error "The `%s' form did not expand to a string" (car form)))
|
||||
(cons (list regexp) nil))))
|
||||
|
||||
(defun rx--translate-compat-symbol-entry (entry)
|
||||
"Translate a compatibility symbol definition for ENTRY.
|
||||
Return (REGEXP . PRECEDENCE) or nil if none."
|
||||
(and (progn
|
||||
(while (and entry (not (stringp (cdr entry))))
|
||||
(setq entry
|
||||
(if (symbolp (cdr entry))
|
||||
;; Alias for another entry.
|
||||
(assq (cdr entry) rx-constituents)
|
||||
;; Wrong type, try further down the list.
|
||||
(assq (car entry)
|
||||
(cdr (memq entry rx-constituents))))))
|
||||
entry)
|
||||
(cons (list (cdr entry)) nil)))
|
||||
|
||||
(defun rx--translate-compat-form-entry (orig-form entry)
|
||||
"Translate a compatibility ORIG-FORM definition for ENTRY.
|
||||
Return (REGEXP . PRECEDENCE) or nil if none."
|
||||
(and (progn
|
||||
(while (and entry (not (consp (cdr entry))))
|
||||
(setq entry
|
||||
(if (symbolp (cdr entry))
|
||||
;; Alias for another entry.
|
||||
(assq (cdr entry) rx-constituents)
|
||||
;; Wrong type, try further down the list.
|
||||
(assq (car entry)
|
||||
(cdr (memq entry rx-constituents))))))
|
||||
entry)
|
||||
(rx--translate-compat-form (cdr entry) orig-form)))
|
||||
|
||||
(defun rx--substitute (bindings form)
|
||||
"Substitute BINDINGS in FORM. BINDINGS is an alist of (NAME . VALUES)
|
||||
where VALUES is a list to splice into FORM wherever NAME occurs.
|
||||
|
@ -1101,6 +1260,7 @@ can expand to any number of values."
|
|||
((or 'seq : 'and 'sequence) (rx--translate-seq body))
|
||||
((or 'or '|) (rx--translate-or body))
|
||||
((or 'any 'in 'char) (rx--translate-any nil body))
|
||||
('rx--char-alt (rx--translate-char-alt nil body))
|
||||
('not-char (rx--translate-any t body))
|
||||
('not (rx--translate-not nil body))
|
||||
('intersection (rx--translate-intersection nil body))
|
||||
|
@ -1141,23 +1301,13 @@ can expand to any number of values."
|
|||
(cond
|
||||
((not (symbolp op)) (error "Bad rx operator `%S'" op))
|
||||
|
||||
((let ((expanded (rx--expand-def form)))
|
||||
((let ((expanded (rx--expand-def-form form)))
|
||||
(and expanded
|
||||
(rx--translate expanded))))
|
||||
|
||||
;; For compatibility with old rx.
|
||||
((let ((entry (assq op rx-constituents)))
|
||||
(and (progn
|
||||
(while (and entry (not (consp (cdr entry))))
|
||||
(setq entry
|
||||
(if (symbolp (cdr entry))
|
||||
;; Alias for another entry.
|
||||
(assq (cdr entry) rx-constituents)
|
||||
;; Wrong type, try further down the list.
|
||||
(assq (car entry)
|
||||
(cdr (memq entry rx-constituents))))))
|
||||
entry)
|
||||
(rx--translate-compat-form (cdr entry) form))))
|
||||
(and entry (rx--translate-compat-form-entry form entry))))
|
||||
|
||||
(t (error "Unknown rx form `%s'" op)))))))
|
||||
|
||||
|
|
|
@ -41,19 +41,31 @@
|
|||
(should (equal (rx "" (or "ab" nonl) "")
|
||||
"ab\\|.")))
|
||||
|
||||
;; FIXME: Extend tests for `or', `not' etc to cover char pattern combination,
|
||||
;; including (syntax whitespace) and (syntax word).
|
||||
|
||||
(ert-deftest rx-or ()
|
||||
(should (equal (rx (or "ab" (| "c" nonl) "de"))
|
||||
"ab\\|c\\|.\\|de"))
|
||||
(should (equal (rx (or "ab" (| "cd" nonl) "de"))
|
||||
"ab\\|cd\\|.\\|de"))
|
||||
(should (equal (rx (or "ab" "abc" ?a))
|
||||
"\\(?:a\\(?:bc?\\)?\\)"))
|
||||
(should (equal (rx (or "ab" (| (or "abcd" "abcde")) (or "a" "abc")))
|
||||
"\\(?:a\\(?:b\\(?:c\\(?:de?\\)?\\)?\\)?\\)"))
|
||||
(should (equal (rx (or "a" (eval (string ?a ?b))))
|
||||
"\\(?:ab?\\)"))
|
||||
(should (equal (rx (| nonl "ac") (| "bd" blank))
|
||||
"\\(?:.\\|ac\\)\\(?:bd\\|[[:blank:]]\\)"))
|
||||
(should (equal (rx (| nonl "a") (| "b" blank))
|
||||
"\\(?:.\\|a\\)\\(?:b\\|[[:blank:]]\\)"))
|
||||
".[b[:blank:]]"))
|
||||
(should (equal (rx (|))
|
||||
"\\`a\\`")))
|
||||
"\\`a\\`"))
|
||||
(should (equal (rx (or "a" (not anychar) punct ?c "b" (not (not ?d))))
|
||||
"[a-d[:punct:]]"))
|
||||
(should (equal (rx (or nonl ?\n))
|
||||
"[^z-a]"))
|
||||
(should (equal (rx (or "ab" "a" "b" blank (syntax whitespace) word "z"))
|
||||
"ab\\|[ab[:blank:]]\\|\\s-\\|[z[:word:]]"))
|
||||
)
|
||||
|
||||
(ert-deftest rx-def-in-or ()
|
||||
(rx-let ((a b)
|
||||
|
@ -101,14 +113,18 @@
|
|||
"[\177ÿ\200-\377]"))
|
||||
;; Range between normal chars and raw bytes: must be split to be parsed
|
||||
;; correctly by the Emacs regexp engine.
|
||||
(should (equal
|
||||
(rx (any (0 . #x3fffff)) (any (?G . #x3fff9a)) (any (?Ü . #x3ffff2)))
|
||||
"[\0-\x3fff7f\x80-\xff][G-\x3fff7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]"))
|
||||
(should (equal (rx (any (0 . #x3fffff) word) (any (?G . #x3fff9a) word)
|
||||
(any (?Ü . #x3ffff2) word))
|
||||
(concat "[\0-\x3fff7f\x80-\xff[:word:]]"
|
||||
"[G-\x3fff7f\x80-\x9a[:word:]]"
|
||||
"[Ü-\x3fff7f\x80-\xf2[:word:]]")))
|
||||
;; As above but with ranges in string form. For historical reasons,
|
||||
;; we special-case ASCII-to-raw ranges to exclude non-ASCII unicode.
|
||||
(should (equal
|
||||
(rx (any "\x00-\xff") (any "G-\x9a") (any "Ü-\xf2"))
|
||||
"[\0-\x7f\x80-\xff][G-\x7f\x80-\x9a][Ü-\x3fff7f\x80-\xf2]")))
|
||||
(should (equal (rx (any "\x00-\xff" alpha) (any "G-\x9a" alpha)
|
||||
(any "Ü-\xf2" alpha))
|
||||
(concat "[\0-\x7f\x80-\xff[:alpha:]]"
|
||||
"[G-\x7f\x80-\x9a[:alpha:]]"
|
||||
"[Ü-\x3fff7f\x80-\xf2[:alpha:]]"))))
|
||||
|
||||
(ert-deftest rx-any ()
|
||||
(should (equal (rx (any ?A (?C . ?D) "F-H" "J-L" "M" "N-P" "Q" "RS"))
|
||||
|
@ -175,7 +191,10 @@
|
|||
"[a[:space:][:digit:]]"))
|
||||
(should (equal (rx (not "\n") (not ?\n) (not (any "\n")) (not-char ?\n)
|
||||
(| (not (in "a\n")) (not (char ?\n (?b . ?b)))))
|
||||
".....")))
|
||||
"....."))
|
||||
(should (equal (rx (or (in "g-k") (in "a-f") (or ?r (in "i-m" "n-q"))))
|
||||
"[a-r]"))
|
||||
)
|
||||
|
||||
(ert-deftest rx-pcase ()
|
||||
(should (equal (pcase "i18n" ((rx (let x (+ digit))) (list 'ok x)))
|
||||
|
@ -392,7 +411,16 @@
|
|||
(should (equal (rx (or (not (in "abc")) (not (char "bcd"))))
|
||||
"[^bc]"))
|
||||
(should (equal (rx (or "x" (? "yz")))
|
||||
"x\\|\\(?:yz\\)?")))
|
||||
"x\\|\\(?:yz\\)?"))
|
||||
(should (equal (rx (or anychar (not anychar)))
|
||||
"[^z-a]"))
|
||||
(should (equal (rx (or (not (in "a-p")) (not (in "k-u"))))
|
||||
"[^k-p]"))
|
||||
(should (equal (rx (or (not (in "a-p")) word (not (in "k-u"))))
|
||||
"[\0-jq-\x3fff7f\x80-\xff[:word:]]"))
|
||||
(should (equal (rx (or (in "a-f" blank) (in "c-z") blank))
|
||||
"[a-z[:blank:]]"))
|
||||
)
|
||||
|
||||
(ert-deftest rx-def-in-charset-or ()
|
||||
(rx-let ((a (any "badc"))
|
||||
|
@ -613,52 +641,52 @@
|
|||
|
||||
;;; unit tests for internal functions
|
||||
|
||||
(ert-deftest rx--complement-intervals ()
|
||||
(should (equal (rx--complement-intervals '())
|
||||
(ert-deftest rx--interval-set-complement ()
|
||||
(should (equal (rx--interval-set-complement '())
|
||||
'((0 . #x3fffff))))
|
||||
(should (equal (rx--complement-intervals '((10 . 20) (30 . 40)))
|
||||
(should (equal (rx--interval-set-complement '((10 . 20) (30 . 40)))
|
||||
'((0 . 9) (21 . 29) (41 . #x3fffff))))
|
||||
(should (equal (rx--complement-intervals '((0 . #x3fffff)))
|
||||
(should (equal (rx--interval-set-complement '((0 . #x3fffff)))
|
||||
'()))
|
||||
(should (equal (rx--complement-intervals
|
||||
(should (equal (rx--interval-set-complement
|
||||
'((0 . 10) (20 . 20) (30 . #x3fffff)))
|
||||
'((11 . 19) (21 . 29)))))
|
||||
|
||||
(ert-deftest rx--union-intervals ()
|
||||
(should (equal (rx--union-intervals '() '()) '()))
|
||||
(should (equal (rx--union-intervals '() '((10 . 20) (30 . 40)))
|
||||
(ert-deftest rx--interval-set-union ()
|
||||
(should (equal (rx--interval-set-union '() '()) '()))
|
||||
(should (equal (rx--interval-set-union '() '((10 . 20) (30 . 40)))
|
||||
'((10 . 20) (30 . 40))))
|
||||
(should (equal (rx--union-intervals '((10 . 20) (30 . 40)) '())
|
||||
(should (equal (rx--interval-set-union '((10 . 20) (30 . 40)) '())
|
||||
'((10 . 20) (30 . 40))))
|
||||
(should (equal (rx--union-intervals '((5 . 15) (18 . 24) (32 . 40))
|
||||
(should (equal (rx--interval-set-union '((5 . 15) (18 . 24) (32 . 40))
|
||||
'((10 . 20) (30 . 40) (50 . 60)))
|
||||
'((5 . 24) (30 . 40) (50 . 60))))
|
||||
(should (equal (rx--union-intervals '((10 . 20) (30 . 40) (50 . 60))
|
||||
(should (equal (rx--interval-set-union '((10 . 20) (30 . 40) (50 . 60))
|
||||
'((0 . 9) (21 . 29) (41 . 50)))
|
||||
'((0 . 60))))
|
||||
(should (equal (rx--union-intervals '((10 . 20) (30 . 40))
|
||||
(should (equal (rx--interval-set-union '((10 . 20) (30 . 40))
|
||||
'((12 . 18) (28 . 42)))
|
||||
'((10 . 20) (28 . 42))))
|
||||
(should (equal (rx--union-intervals '((10 . 20) (30 . 40))
|
||||
(should (equal (rx--interval-set-union '((10 . 20) (30 . 40))
|
||||
'((0 . #x3fffff)))
|
||||
'((0 . #x3fffff)))))
|
||||
|
||||
(ert-deftest rx--intersect-intervals ()
|
||||
(should (equal (rx--intersect-intervals '() '()) '()))
|
||||
(should (equal (rx--intersect-intervals '() '((10 . 20) (30 . 40)))
|
||||
(ert-deftest rx--interval-set-intersection ()
|
||||
(should (equal (rx--interval-set-intersection '() '()) '()))
|
||||
(should (equal (rx--interval-set-intersection '() '((10 . 20) (30 . 40)))
|
||||
'()))
|
||||
(should (equal (rx--intersect-intervals '((10 . 20) (30 . 40)) '())
|
||||
(should (equal (rx--interval-set-intersection '((10 . 20) (30 . 40)) '())
|
||||
'()))
|
||||
(should (equal (rx--intersect-intervals '((5 . 15) (18 . 24) (32 . 40))
|
||||
(should (equal (rx--interval-set-intersection '((5 . 15) (18 . 24) (32 . 40))
|
||||
'((10 . 20) (30 . 40) (50 . 60)))
|
||||
'((10 . 15) (18 . 20) (32 . 40))))
|
||||
(should (equal (rx--intersect-intervals '((10 . 20) (30 . 40) (50 . 60))
|
||||
(should (equal (rx--interval-set-intersection '((10 . 20) (30 . 40) (50 . 60))
|
||||
'((0 . 9) (21 . 29) (41 . 50)))
|
||||
'((50 . 50))))
|
||||
(should (equal (rx--intersect-intervals '((10 . 20) (30 . 40))
|
||||
(should (equal (rx--interval-set-intersection '((10 . 20) (30 . 40))
|
||||
'((12 . 18) (28 . 42)))
|
||||
'((12 . 18) (30 . 40))))
|
||||
(should (equal (rx--intersect-intervals '((10 . 20) (30 . 40))
|
||||
(should (equal (rx--interval-set-intersection '((10 . 20) (30 . 40))
|
||||
'((0 . #x3fffff)))
|
||||
'((10 . 20) (30 . 40)))))
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue