Initial revision
This commit is contained in:
parent
55fd504836
commit
e65f17fee9
2 changed files with 1614 additions and 0 deletions
965
lisp/emacs-lisp/sregex.el
Normal file
965
lisp/emacs-lisp/sregex.el
Normal file
|
@ -0,0 +1,965 @@
|
|||
;;; sregex.el --- symbolic regular expressions
|
||||
|
||||
;; Copyright (C) 1997, 1998 Free Software Foundation, Inc.
|
||||
|
||||
;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
|
||||
;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
|
||||
|
||||
;; This file is part of GNU Emacs.
|
||||
|
||||
;; GNU Emacs is free software; you can redistribute it and/or modify
|
||||
;; it under the terms of the GNU General Public License as published by
|
||||
;; the Free Software Foundation; either version 2, or (at your option)
|
||||
;; any later version.
|
||||
|
||||
;; GNU Emacs is distributed in the hope that it will be useful,
|
||||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;; GNU General Public License for more details.
|
||||
|
||||
;; You should have received a copy of the GNU General Public License
|
||||
;; along with GNU Emacs; see the file COPYING. If not, write to the
|
||||
;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
||||
;; Boston, MA 02111-1307, USA.
|
||||
|
||||
;;; Commentary:
|
||||
|
||||
;; This package allows you to write regular expressions using a
|
||||
;; totally new, Lisp-like syntax.
|
||||
|
||||
;; A "symbolic regular expression" (sregex for short) is a Lisp form
|
||||
;; that, when evaluated, produces the string form of the specified
|
||||
;; regular expression. Here's a simple example:
|
||||
|
||||
;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
|
||||
|
||||
;; As you can see, an sregex is specified by placing one or more
|
||||
;; special clauses in a call to `sregexq'. The clause in this case is
|
||||
;; the `or' of two strings (not to be confused with the Lisp function
|
||||
;; `or'). The list of allowable clauses appears below.
|
||||
|
||||
;; With sregex, it is never necessary to "escape" magic characters
|
||||
;; that are meant to be taken literally; that happens automatically.
|
||||
;; For example:
|
||||
|
||||
;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
|
||||
|
||||
;; It is also unnecessary to "group" parts of the expression together
|
||||
;; to overcome operator precedence; that also happens automatically.
|
||||
;; For example:
|
||||
|
||||
;; (sregexq (opt (or "Bob" "Robert"))) => "\\(Bob\\|Robert\\)?"
|
||||
|
||||
;; It *is* possible to group parts of the expression in order to refer
|
||||
;; to them with numbered backreferences:
|
||||
|
||||
;; (sregexq (group (or "Go" "Run"))
|
||||
;; ", Spot, "
|
||||
;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
|
||||
|
||||
;; If `sregexq' needs to introduce its own grouping parentheses, it
|
||||
;; will automatically renumber your backreferences:
|
||||
|
||||
;; (sregexq (opt "resent-")
|
||||
;; (group (or "to" "cc" "bcc"))
|
||||
;; ": "
|
||||
;; (backref 1)) => "\\(resent-\\)?\\(to\\|cc\\|bcc\\): \\2"
|
||||
|
||||
;; `sregexq' is a macro. Each time it is used, it constructs a simple
|
||||
;; Lisp expression that then invokes a moderately complex engine to
|
||||
;; interpret the sregex and render the string form. Because of this,
|
||||
;; I don't recommend sprinkling calls to `sregexq' throughout your
|
||||
;; code, the way one normally does with string regexes (which are
|
||||
;; cheap to evaluate). Instead, it's wiser to precompute the regexes
|
||||
;; you need wherever possible instead of repeatedly constructing the
|
||||
;; same ones over and over. Example:
|
||||
|
||||
;; (let ((field-regex (sregexq (opt "resent-")
|
||||
;; (or "to" "cc" "bcc"))))
|
||||
;; ...
|
||||
;; (while ...
|
||||
;; ...
|
||||
;; (re-search-forward field-regex ...)
|
||||
;; ...))
|
||||
|
||||
;; The arguments to `sregexq' are automatically quoted, but the
|
||||
;; flipside of this is that it is not straightforward to include
|
||||
;; computed (i.e., non-constant) values in `sregexq' expressions. So
|
||||
;; `sregex' is a function that is like `sregexq' but which does not
|
||||
;; automatically quote its values. Literal sregex clauses must be
|
||||
;; explicitly quoted like so:
|
||||
|
||||
;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
|
||||
|
||||
;; but computed clauses can be included easily, allowing for the reuse
|
||||
;; of common clauses:
|
||||
|
||||
;; (let ((dotstar '(0+ any))
|
||||
;; (whitespace '(1+ (syntax ?-)))
|
||||
;; (digits '(1+ (char (?0 . ?9)))))
|
||||
;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
|
||||
|
||||
;; This package also provides sregex-specific versions of the Emacs
|
||||
;; functions `replace-match', `match-string',
|
||||
;; `match-string-no-properties', `match-beginning', `match-end', and
|
||||
;; `match-data'. In each case, the sregex version's name begins with
|
||||
;; `sregex-' and takes one additional optional parameter, an sregex
|
||||
;; "info" object. Each of these functions is concerned with numbered
|
||||
;; submatches. Since sregex may renumber submatches, alternate
|
||||
;; versions of these functions are needed that know how to adjust the
|
||||
;; supplied number.
|
||||
|
||||
;; The sregex info object for the most recently evaluated sregex can
|
||||
;; be obtained with `sregex-info'; so if you precompute your sregexes
|
||||
;; and you plan to use `replace-match' or one of the others with it,
|
||||
;; you need to record the info object for later use:
|
||||
|
||||
;; (let* ((regex (sregexq (opt "resent-")
|
||||
;; (group (or "to" "cc" "bcc"))
|
||||
;; ":"))
|
||||
;; (regex-info (sregex-info)))
|
||||
;; ...
|
||||
;; (if (re-search-forward regex ...)
|
||||
;; (let ((which (sregex-match-string 1 nil regex-info)))
|
||||
;; ...)))
|
||||
|
||||
;; In this example, `regex' is "\\(resent-\\)?\\(to\\|cc\\|bcc\\):",
|
||||
;; so the call to (sregex-match-string 1 ...) is automatically turned
|
||||
;; into a call to (match-string 2 ...).
|
||||
|
||||
;; If the sregex info argument to `sregex-replace-match',
|
||||
;; `sregex-match-string', `sregex-match-string-no-properties',
|
||||
;; `sregex-match-beginning', `sregex-match-end', or
|
||||
;; `sregex-match-data' is omitted, the current value of (sregex-info)
|
||||
;; is used.
|
||||
|
||||
;; You can do your own sregex submatch renumbering with
|
||||
;; `sregex-backref-num'.
|
||||
|
||||
;; Finally, `sregex-save-match-data' is like `save-match-data' but
|
||||
;; also saves and restores the information maintained by
|
||||
;; `sregex-info'.
|
||||
|
||||
;; To use this package in a Lisp program, simply (require 'sregex).
|
||||
|
||||
;; Here are the clauses allowed in an `sregex' or `sregexq'
|
||||
;; expression:
|
||||
|
||||
;; - a string
|
||||
;; This stands for the literal string. If it contains
|
||||
;; metacharacters, they will be escaped in the resulting regex
|
||||
;; (using `regexp-quote').
|
||||
|
||||
;; - the symbol `any'
|
||||
;; This stands for ".", a regex matching any character except
|
||||
;; newline.
|
||||
|
||||
;; - the symbol `bol'
|
||||
;; Stands for "^", matching the empty string at the beginning of a line.
|
||||
|
||||
;; - the symbol `eol'
|
||||
;; Stands for "$", matching the empty string at the end of a line.
|
||||
|
||||
;; - (group CLAUSE ...)
|
||||
;; Groups the given CLAUSEs using "\\(" and "\\)".
|
||||
|
||||
;; - (sequence CLAUSE ...)
|
||||
|
||||
;; Groups the given CLAUSEs; may or may not use "\\(" and "\\)".
|
||||
;; Clauses grouped by `sequence' do not count for purposes of
|
||||
;; numbering backreferences. Use `sequence' in situations like
|
||||
;; this:
|
||||
|
||||
;; (sregexq (or "dog" "cat"
|
||||
;; (sequence (opt "sea ") "monkey")))
|
||||
;; => "dog\\|cat\\|\\(sea \\)?monkey"
|
||||
|
||||
;; where a single `or' alternate needs to contain multiple
|
||||
;; subclauses.
|
||||
|
||||
;; - (backref N)
|
||||
;; Matches the same string previously matched by the Nth "group" in
|
||||
;; the same sregex. N is a positive integer. In the resulting
|
||||
;; regex, N may be adjusted to account for automatically introduced
|
||||
;; groups.
|
||||
|
||||
;; - (or CLAUSE ...)
|
||||
;; Matches any one of the CLAUSEs by separating them with "\\|".
|
||||
|
||||
;; - (0+ CLAUSE ...)
|
||||
;; Concatenates the given CLAUSEs and matches zero or more
|
||||
;; occurrences by appending "*".
|
||||
|
||||
;; - (1+ CLAUSE ...)
|
||||
;; Concatenates the given CLAUSEs and matches one or more
|
||||
;; occurrences by appending "+".
|
||||
|
||||
;; - (opt CLAUSE ...)
|
||||
;; Concatenates the given CLAUSEs and matches zero or one occurrence
|
||||
;; by appending "?".
|
||||
|
||||
;; - (repeat MIN MAX CLAUSE ...)
|
||||
;; Concatenates the given CLAUSEs and constructs a regex matching at
|
||||
;; least MIN occurrences and at most MAX occurrences. MIN must be a
|
||||
;; non-negative integer. MAX must be a non-negative integer greater
|
||||
;; than or equal to MIN; or MAX can be nil to mean "infinity."
|
||||
|
||||
;; - (char CHAR-CLAUSE ...)
|
||||
;; Creates a "character class" matching one character from the given
|
||||
;; set. See below for how to construct a CHAR-CLAUSE.
|
||||
|
||||
;; - (not-char CHAR-CLAUSE ...)
|
||||
;; Creates a "character class" matching any one character not in the
|
||||
;; given set. See below for how to construct a CHAR-CLAUSE.
|
||||
|
||||
;; - the symbol `bot'
|
||||
;; Stands for "\\`", matching the empty string at the beginning of
|
||||
;; text (beginning of a string or of a buffer).
|
||||
|
||||
;; - the symbol `eot'
|
||||
;; Stands for "\\'", matching the empty string at the end of text.
|
||||
|
||||
;; - the symbol `point'
|
||||
;; Stands for "\\=", matching the empty string at point.
|
||||
|
||||
;; - the symbol `word-boundary'
|
||||
;; Stands for "\\b", matching the empty string at the beginning or
|
||||
;; end of a word.
|
||||
|
||||
;; - the symbol `not-word-boundary'
|
||||
;; Stands for "\\B", matching the empty string not at the beginning
|
||||
;; or end of a word.
|
||||
|
||||
;; - the symbol `bow'
|
||||
;; Stands for "\\<", matching the empty string at the beginning of a
|
||||
;; word.
|
||||
|
||||
;; - the symbol `eow'
|
||||
;; Stands for "\\>", matching the empty string at the end of a word.
|
||||
|
||||
;; - the symbol `wordchar'
|
||||
;; Stands for the regex "\\w", matching a word-constituent character
|
||||
;; (as determined by the current syntax table)
|
||||
|
||||
;; - the symbol `not-wordchar'
|
||||
;; Stands for the regex "\\W", matching a non-word-constituent
|
||||
;; character.
|
||||
|
||||
;; - (syntax CODE)
|
||||
;; Stands for the regex "\\sCODE", where CODE is a syntax table code
|
||||
;; (a single character). Matches any character with the requested
|
||||
;; syntax.
|
||||
|
||||
;; - (not-syntax CODE)
|
||||
;; Stands for the regex "\\SCODE", where CODE is a syntax table code
|
||||
;; (a single character). Matches any character without the
|
||||
;; requested syntax.
|
||||
|
||||
;; - (regex REGEX)
|
||||
;; This is a "trapdoor" for including ordinary regular expression
|
||||
;; strings in the result. Some regular expressions are clearer when
|
||||
;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
|
||||
;; instance. However, see the note under "Bugs," below.
|
||||
|
||||
;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
|
||||
;; has one of the following forms:
|
||||
|
||||
;; - a character
|
||||
;; Adds that character to the set.
|
||||
|
||||
;; - a string
|
||||
;; Adds all the characters in the string to the set.
|
||||
|
||||
;; - A pair (MIN . MAX)
|
||||
;; Where MIN and MAX are characters, adds the range of characters
|
||||
;; from MIN through MAX to the set.
|
||||
|
||||
;;; To do:
|
||||
|
||||
;; Make (sregexq (or "a" (sequence "b" "c"))) return "a\\|bc" instead
|
||||
;; of "a\\|\\(bc\\)"
|
||||
|
||||
;; An earlier version of this package could optionally translate the
|
||||
;; symbolic regex into other languages' syntaxes, e.g. Perl. For
|
||||
;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
|
||||
;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
|
||||
;; such a facility.
|
||||
|
||||
;;; Bugs:
|
||||
|
||||
;; The (regex REGEX) form can confuse the code that distinguishes
|
||||
;; introduced groups from user-specified groups. Try to avoid using
|
||||
;; grouping within a `regex' form. Failing that, try to avoid using
|
||||
;; backrefs if you're using `regex'.
|
||||
|
||||
;;; Code:
|
||||
|
||||
(defsubst sregex--value-unitp (val)
  "Return the first element of the sregex value VAL (its UNITP slot)."
  (car val))
|
||||
(defsubst sregex--value-groups (val)
  "Return the second element of the sregex value VAL (its GROUPS slot)."
  (car (cdr val)))
|
||||
(defsubst sregex--value-tree (val)
  "Return the third element of the sregex value VAL (its TREE slot)."
  (car (cdr (cdr val))))
|
||||
|
||||
(defun sregex--make-value (unitp groups tree)
  "Build an sregex value: the three-element list (UNITP GROUPS TREE)."
  (cons unitp (cons groups (cons tree nil))))
|
||||
|
||||
(defvar sregex--current-sregex nil
  "Global state for `sregex-info'.
Set by `sregex' to the sregex value it has just interpreted.")
|
||||
|
||||
(defun sregex-info ()
  "Return extra information about the latest call to `sregex'.
The returned object is what is needed to translate backreference
numbers as the user wrote them into the numbers that are valid in the
generated regexp.  See e.g. `sregex-match-string' and
`sregex-backref-num'."
  (symbol-value 'sregex--current-sregex))
|
||||
|
||||
; (require 'advice)
|
||||
; (defadvice save-match-data (around sregex-save-match-data protect)
|
||||
; (let ((sregex--saved-sregex sregex--current-sregex))
|
||||
; (unwind-protect
|
||||
; ad-do-it
|
||||
; (setq sregex--current-sregex sregex--saved-sregex))))
|
||||
(defmacro sregex-save-match-data (&rest forms)
  "Like `save-match-data', but also saves and restores `sregex-info' data.
Evaluate FORMS inside `save-match-data' and return the value of the
last one.  The value `sregex--current-sregex' had on entry is put back
afterwards, even if FORMS exit non-locally."
  ;; Keep the saved value in an uninterned symbol so that FORMS can
  ;; neither see nor clobber the temporary binding, and so that a user
  ;; variable that happens to be named `sregex--saved-sregex' is not
  ;; shadowed inside FORMS.
  (let ((saved (make-symbol "saved-sregex")))
    `(let ((,saved sregex--current-sregex))
       (unwind-protect
           (save-match-data ,@forms)
         (setq sregex--current-sregex ,saved)))))
|
||||
|
||||
(defun sregex-replace-match (replacement
                             &optional fixedcase literal string subexp sregex)
  "Like `replace-match', for a regexp made with `sregex'.
REPLACEMENT, FIXEDCASE, LITERAL and STRING are handed to
`replace-match' unchanged.  SUBEXP, when non-nil, is first adjusted to
allow for \"introduced groups\".  The extra optional argument SREGEX is
the `sregex' info, which can be obtained with `sregex-info'; if it is
omitted or nil, it defaults to the current value of (sregex-info)."
  ;; NOTE(review): `sregex--render-tree' hands `sregex-backref-num' a
  ;; whole sregex info value, whereas this passes only its groups
  ;; list; confirm which of the two `sregex-backref-num' expects.
  (let* ((info (or sregex sregex--current-sregex))
         (group-num (if subexp
                        (sregex-backref-num subexp
                                            (sregex--value-groups info)))))
    (replace-match replacement fixedcase literal string group-num)))
|
||||
|
||||
(defun sregex-match-string (count &optional in-string sregex)
  "Like `match-string', for a regexp made with `sregex'.
COUNT, when non-nil, is adjusted to allow for \"introduced groups\"
before being handed to `match-string' together with IN-STRING.  The
extra optional argument SREGEX is the `sregex' info, which can be
obtained with `sregex-info'; if it is omitted or nil, it defaults to
the current value of (sregex-info)."
  ;; NOTE(review): `sregex--render-tree' hands `sregex-backref-num' a
  ;; whole sregex info value, whereas this passes only its groups
  ;; list; confirm which of the two `sregex-backref-num' expects.
  (let* ((info (or sregex sregex--current-sregex))
         (group-num (if count
                        (sregex-backref-num count
                                            (sregex--value-groups info)))))
    (match-string group-num in-string)))
|
||||
|
||||
(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Like `match-string-no-properties', for a regexp made with `sregex'.
COUNT, when non-nil, is adjusted to allow for \"introduced groups\"
before being handed to `match-string-no-properties' together with
IN-STRING.  The extra optional argument SREGEX is the `sregex' info,
which can be obtained with `sregex-info'; if it is omitted or nil, it
defaults to the current value of (sregex-info)."
  ;; NOTE(review): `sregex--render-tree' hands `sregex-backref-num' a
  ;; whole sregex info value, whereas this passes only its groups
  ;; list; confirm which of the two `sregex-backref-num' expects.
  (let* ((info (or sregex sregex--current-sregex))
         (group-num (if count
                        (sregex-backref-num count
                                            (sregex--value-groups info)))))
    (match-string-no-properties group-num in-string)))
|
||||
|
||||
(defun sregex-match-beginning (count &optional sregex)
  "Like `match-beginning', for a regexp made with `sregex'.
COUNT is adjusted to allow for \"introduced groups\" before being
handed to `match-beginning'.  The extra optional argument SREGEX is
the `sregex' info, which can be obtained with `sregex-info'; if it is
omitted or nil, it defaults to the current value of (sregex-info)."
  ;; NOTE(review): `sregex--render-tree' hands `sregex-backref-num' a
  ;; whole sregex info value, whereas this passes only its groups
  ;; list; confirm which of the two `sregex-backref-num' expects.
  (let ((info (or sregex sregex--current-sregex)))
    (match-beginning
     (sregex-backref-num count (sregex--value-groups info)))))
|
||||
|
||||
(defun sregex-match-end (count &optional sregex)
  "Like `match-end', for a regexp made with `sregex'.
COUNT is adjusted to allow for \"introduced groups\" before being
handed to `match-end'.  The extra optional argument SREGEX is the
`sregex' info, which can be obtained with `sregex-info'; if it is
omitted or nil, it defaults to the current value of (sregex-info)."
  ;; NOTE(review): `sregex--render-tree' hands `sregex-backref-num' a
  ;; whole sregex info value, whereas this passes only its groups
  ;; list; confirm which of the two `sregex-backref-num' expects.
  (let ((info (or sregex sregex--current-sregex)))
    (match-end
     (sregex-backref-num count (sregex--value-groups info)))))
|
||||
|
||||
(defun sregex-match-data (&optional sregex)
  "Like `match-data', for a regexp made with `sregex'.
This takes one additional optional argument, the `sregex' info, which
can be obtained with `sregex-info'.  \"Introduced groups\" are removed
from the result.  If the extra argument is omitted or nil, it defaults
to the current value of (sregex-info).

The result starts with the bounds of the whole match, followed by the
bounds of each submatch whose entry in the groups list of the sregex
info is non-nil.  If there is no match data at all, return nil, just
as `match-data' does."
  (let ((data (match-data))
        (groups (sregex--value-groups (or sregex
                                          sregex--current-sregex)))
        (kept nil))                     ;grows in reverse
    (if (null data)
        ;; Nothing matched: do not fabricate a (nil nil) whole-match pair.
        nil
      ;; The first pair is the whole match; it is always kept.
      (setq kept (list (car (cdr data)) (car data))
            data (cdr (cdr data)))
      ;; Walk the submatch pairs and the group flags in parallel,
      ;; keeping only the pairs that belong to user-requested groups.
      ;; Pairs beyond the end of GROUPS are dropped.
      (while data
        (if (car groups)
            (setq kept (cons (car (cdr data))
                             (cons (car data) kept))))
        (setq groups (cdr groups)
              data (cdr (cdr data))))
      (nreverse kept))))
|
||||
|
||||
(defun sregex--render-tree (tree sregex)
  "Render the parse tree TREE as a regexp string.
TREE is a cons whose car names the node type:

  (str . STRING)        STRING, inserted verbatim
  (or TREE ...)         the rendered TREEs joined with \"\\\\|\"
  (sequence TREE ...)   the rendered TREEs concatenated
  (group . TREE)        TREE wrapped in \"\\\\(\" and \"\\\\)\"
  (opt . TREE)          TREE followed by \"?\"
  (0+ . TREE)           TREE followed by \"*\"
  (1+ . TREE)           TREE followed by \"+\"
  (backref . N)         a backslash followed by N as adjusted by
                        `sregex-backref-num'

SREGEX is passed to `sregex-backref-num' when a backref node is
rendered.  Signal an error if an adjusted backreference number exceeds
9, or if the node type is not one of the above."
  (let* ((key (car tree))
         (body (cdr tree))
         ;; The three postfix operators differ only in the character
         ;; that is appended.
         (suffix (cdr (assq key '((opt . "?") (0+ . "*") (1+ . "+"))))))
    (cond (suffix
           (concat (sregex--render-tree body sregex) suffix))
          ((eq key 'str)
           body)
          ((eq key 'or)
           ;; `function' rather than `quote', so that the lambda can be
           ;; byte-compiled and still sees SREGEX under lexical binding.
           (mapconcat (function (lambda (x)
                                  (sregex--render-tree x sregex)))
                      body
                      "\\|"))
          ((eq key 'sequence)
           (mapconcat (function (lambda (x)
                                  (sregex--render-tree x sregex)))
                      body
                      ""))
          ((eq key 'group)
           (concat "\\("
                   (sregex--render-tree body sregex)
                   "\\)"))
          ((eq key 'backref)
           (let ((num (sregex-backref-num body sregex)))
             ;; Only \\1 through \\9 can be written in a regexp.
             (if (> num 9)
                 (error "sregex: backref number %d too high after adjustment"
                        num)
               (concat "\\" (int-to-string num)))))
          (t (error "sregex internal error: unknown tree type %S"
                    key)))))
|
||||
|
||||
(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"

An argument-evaluating sregex interpreter lets you reuse sregex
subexpressions:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\""
  (let ((info (sregex--sequence exps nil)))
    ;; Record the info before rendering, so that `sregex-info' reports
    ;; this sregex from here on.
    (setq sregex--current-sregex info)
    (sregex--render-tree (sregex--value-tree info) info)))
|
||||
|
||||
(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  Here is
a simple example:

  (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\"

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"

If `sregexq' needs to introduce its own grouping parentheses, it will
automatically renumber your backreferences:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

   (let ((field-regex (sregexq (opt \"resent-\")
                               (or \"to\" \"cc\" \"bcc\"))))
     ...
     (while ...
       ...
       (re-search-forward field-regex ...)
       ...))

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line.

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line.

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)
  Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 => \"dog\\\\|cat\\\\|\\\\(sea \\\\)?monkey\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.  In the resulting
  regex, N may be adjusted to account for automatically introduced
  groups.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\=\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\\\=>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table).

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.  However, using this can confuse the code that
  distinguishes introduced groups from user-specified groups.  Avoid
  using grouping within a `regex' form.  Failing that, avoid using
  backrefs if you're using `regex'.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  ;; Expands to (apply 'sregex '(EXP ...)): the clauses are passed to
  ;; `sregex' unevaluated, as one quoted list.
  (list 'apply ''sregex (list 'quote exps)))
|
||||
|
||||
(defun sregex--engine (exp combine)
  "Translate the single sregex clause EXP into a value.
EXP may be:
 - a string, which is passed through `regexp-quote';
 - a symbol NAME, handled by calling the function sregex--NAME with
   COMBINE as its only argument;
 - a cons (NAME . ARGS), handled by calling the function sregex--NAME
   with ARGS and COMBINE.
Anything else, including a NAME that has no sregex--NAME function,
signals an \"Invalid expression\" error.

COMBINE describes the context the result will be placed in (this
function itself only distinguishes nil, the symbol `suffix', and any
other non-nil value).  When COMBINE is non-nil and the translated value
is not flagged as a unit, the value is wrapped in an introduced group:
nil is pushed onto its groups list and its tree becomes (group . TREE)."
  (let* ((head (cond ((symbolp exp) exp)
                     ((and (consp exp) (symbolp (car exp))) (car exp))))
         ;; `intern-soft' rather than `intern': looking up an unknown
         ;; clause must not create a new symbol as a side effect.
         (handler (and head
                       (intern-soft (concat "sregex--" (symbol-name head)))))
         (val (cond ((stringp exp)
                     ;; A literal string counts as a unit unless a suffix
                     ;; operator is about to be applied to something other
                     ;; than a single character.
                     (sregex--make-value (or (not (eq combine 'suffix))
                                             (= (length exp) 1))
                                         nil
                                         (cons 'str (regexp-quote exp))))
                    ;; Covers nil, unknown clause names, clause lists whose
                    ;; head is not a symbol, and non-string atoms.
                    ((not (and handler (fboundp handler)))
                     (error "Invalid expression: %s" exp))
                    ((symbolp exp)
                     (funcall handler combine))
                    (t
                     (funcall handler (cdr exp) combine))))
         (unitp (sregex--value-unitp val))
         (groups (sregex--value-groups val))
         (tree (sregex--value-tree val)))
    (if (and combine (not unitp))
        ;; The nil entry records a group the user did not ask for.
        (sregex--make-value t
                            (cons nil groups)
                            (cons 'group tree))
      (sregex--make-value unitp groups tree))))
|
||||
|
||||
(defun sregex--sequence (exps combine)
  "Translate the list of clauses EXPS into one value matching them in order.
A single clause is simply handed to `sregex--engine' together with
COMBINE.  Otherwise every clause is translated in `concat' context, the
groups lists are appended in order, and the trees are collected under a
`sequence' node.  When COMBINE is `suffix' the sequence is additionally
wrapped in an introduced group (nil is pushed onto the groups list).
The result of the multi-clause case is always flagged as a unit."
  (cond
   ((and exps (null (cdr exps)))
    (sregex--engine (car exps) combine))
   (t
    (let* ((vals (mapcar (function
                          (lambda (clause)
                            (sregex--engine clause 'concat)))
                         exps))
           (groups (apply 'append
                          (mapcar (function
                                   (lambda (v) (sregex--value-groups v)))
                                  vals)))
           (seq (cons 'sequence
                      (mapcar (function
                               (lambda (v) (sregex--value-tree v)))
                              vals))))
      (if (eq combine 'suffix)
          (sregex--make-value t
                              (cons nil groups)
                              (cons 'group seq))
        (sregex--make-value t groups seq))))))
|
||||
|
||||
(defun sregex--group (exps combine)
  "Build the value for the clause (group . EXPS).
EXPS is translated as a sequence with no combining context.  The
result is flagged as a unit, its tree is (group . TREE), and t is
pushed onto the groups list to record a group the user asked for.
COMBINE is ignored."
  (let* ((inner (sregex--sequence exps nil))
         (inner-groups (sregex--value-groups inner))
         (inner-tree (sregex--value-tree inner)))
    (sregex--make-value t
                        (cons t inner-groups)
                        (cons 'group inner-tree))))
|
||||
|
||||
(defun sregex-backref-num (n &optional sregex)
  "Adjust backreference number N according to SREGEX.
When `sregex' introduces parenthesized groups that the user didn't ask
for, the numbering of the groups that the user *did* ask for gets all
out of whack.  This function accounts for introduced groups.  Example:

  (sregexq (opt \"ab\")
           (group (or \"c\" \"d\")))  => \"\\\\(ab\\\\)?\\\\(c\\\\|d\\\\)\"
  (setq info (sregex-info))
  (sregex-backref-num 1 info) => 2

The SREGEX parameter is optional and defaults to the current value of
`sregex-info'.

The groups list of SREGEX is walked from the front; every entry
advances the returned number, and only non-nil entries count toward N.
The walk stops once N such entries were seen or the list is exhausted."
  (let ((pending (sregex--value-groups (or sregex
                                           sregex--current-sregex)))
        (wanted n)
        (position 0))
    (while (and pending (> wanted 0))
      (setq position (1+ position))
      ;; Only groups the user asked for use up the requested count.
      (if (car pending)
          (setq wanted (1- wanted)))
      (setq pending (cdr pending)))
    position))
|
||||
|
||||
(defun sregex--backref (exps combine)
  "Build the value for the clause (backref N).
The first element of EXPS is stored unchanged in a (backref . N) tree.
The result is a unit with no groups.  COMBINE is ignored."
  (let ((number (car exps)))
    (sregex--make-value t nil (cons 'backref number))))
|
||||
|
||||
(defun sregex--any (combine)
  "Build the value for the symbol `any': the regexp string \".\".
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str ".")))
|
||||
|
||||
(defun sregex--opt (exps combine)
  "Build the value for the clause (opt . EXPS).
EXPS is translated as a sequence in `suffix' context and the resulting
tree is wrapped as (opt . TREE).  The groups of the sequence are passed
through unchanged and the result is flagged as a unit.  COMBINE is
ignored."
  (let* ((body (sregex--sequence exps 'suffix))
         (body-groups (sregex--value-groups body))
         (body-tree (sregex--value-tree body)))
    (sregex--make-value t body-groups (cons 'opt body-tree))))
|
||||
|
||||
(defun sregex--0+ (exps combine)
  "Build the value for the clause (0+ . EXPS).
EXPS is translated as a sequence in `suffix' context and the resulting
tree is wrapped as (0+ . TREE).  The groups of the sequence are passed
through unchanged and the result is flagged as a unit.  COMBINE is
ignored."
  (let* ((body (sregex--sequence exps 'suffix))
         (body-groups (sregex--value-groups body))
         (body-tree (sregex--value-tree body)))
    (sregex--make-value t body-groups (cons '0+ body-tree))))
|
||||
(defun sregex--1+ (exps combine)
  "Build the value for the clause (1+ . EXPS).
EXPS is translated as a sequence in `suffix' context and the resulting
tree is wrapped as (1+ . TREE).  The groups of the sequence are passed
through unchanged and the result is flagged as a unit.  COMBINE is
ignored."
  (let* ((body (sregex--sequence exps 'suffix))
         (body-groups (sregex--value-groups body))
         (body-tree (sregex--value-tree body)))
    (sregex--make-value t body-groups (cons '1+ body-tree))))
|
||||
|
||||
(defun sregex--repeat (exps combine)
  "Build the value for the clause (repeat MIN MAX . CLAUSES).
EXPS is the list (MIN MAX . CLAUSES).  A nil MIN is treated as 0; a nil
MAX means there is no upper bound.  The clause is rewritten in terms of
the other operators and translated with COMBINE:

  MIN = 0:  MAX = 0   -> an empty unit value
            MAX = 1   -> (opt . CLAUSES)
            MAX = nil -> (0+ . CLAUSES)
            otherwise -> a sequence of MAX copies of (opt . CLAUSES)
  MIN = 1:  MAX = 1   -> the sequence CLAUSES
            MAX = nil -> (1+ . CLAUSES)
            otherwise -> CLAUSES followed by MAX-1 copies of
                         (opt . CLAUSES)
  MIN > 1:  CLAUSES followed by (repeat MIN-1 MAX-1 . CLAUSES)

Signals an error when MIN is not a non-negative integer or when MAX is
neither nil nor an integer that is at least MIN."
  (let ((min (or (car exps) 0))
        (max (car (cdr exps))))
    ;; Reject bad bounds up front: a negative MIN would otherwise recurse
    ;; without ever reaching a base case, and MAX < MIN would end up
    ;; asking `make-list' for a negative number of elements.
    (if (or (not (integerp min))
            (< min 0)
            (and max
                 (or (not (integerp max))
                     (< max min))))
        (error "Invalid repeat bounds: %S %S" min max))
    (setq exps (cdr (cdr exps)))
    (cond ((zerop min)
           (cond ((equal max 0)         ;degenerate
                  (sregex--make-value t nil nil))
                 ((equal max 1)
                  (sregex--opt exps combine))
                 ((not max)
                  (sregex--0+ exps combine))
                 (t (sregex--sequence (make-list max
                                                 (cons 'opt exps))
                                      combine))))
          ((= min 1)
           (cond ((equal max 1)
                  (sregex--sequence exps combine))
                 ((not max)
                  (sregex--1+ exps combine))
                 (t (sregex--sequence (append exps
                                              (make-list (1- max)
                                                         (cons 'opt exps)))
                                      combine))))
          ;; MIN >= 2: peel off one mandatory copy and recurse on the rest.
          (t (sregex--sequence (append exps
                                       (list (append (list 'repeat
                                                           (1- min)
                                                           (and max
                                                                (1- max)))
                                                     exps)))
                               combine)))))
|
||||
|
||||
(defun sregex--or (exps combine)
  "Build the value for the clause (or . EXPS).
A single alternative is simply handed to `sregex--engine' together
with COMBINE.  Otherwise every alternative is translated in `or'
context, the groups lists are appended in order, and the trees are
collected under an `or' node.  The result is flagged as a unit only
when COMBINE is itself `or'."
  (cond
   ((and exps (null (cdr exps)))
    (sregex--engine (car exps) combine))
   (t
    (let* ((vals (mapcar (function
                          (lambda (alternative)
                            (sregex--engine alternative 'or)))
                         exps))
           (groups (apply 'append
                          (mapcar (function
                                   (lambda (v) (sregex--value-groups v)))
                                  vals)))
           (branches (mapcar (function
                              (lambda (v) (sregex--value-tree v)))
                             vals)))
      (sregex--make-value (eq combine 'or)
                          groups
                          (cons 'or branches))))))
|
||||
|
||||
(defmacro sregex--char-range-aux ()
  "Expand to code appending the pending run START..END to CLASS.
This macro is deliberately unhygienic: the expansion reads the
variables `start' and `end' and updates the string variable `class',
all of which must be bound where the macro is used.

When `start' is nil nothing happens.  Otherwise the run is appended as
a single character (END = START), as two characters (END = START + 1),
or as a range \"START-END\".

The characters themselves are appended, whatever their code.  A
backslash is not special inside a bracket expression, so writing a
character as backslash-plus-octal-digits there would add the backslash
and the digits to the set instead of the intended character."
  '(if start
       (let ((startc (char-to-string start))
             (endc (char-to-string end)))
         (setq class
               (concat class
                       (cond ((> end (+ start 1))
                              (concat startc "-" endc))
                             ((> end start)
                              (concat startc endc))
                             (t startc)))))))
|
||||
|
||||
(defmacro sregex--char-range (rstart rend)
  "Expand to code flushing the marked characters RSTART..REND of CHARS.
This macro is deliberately unhygienic: the expansion reads and clears
entries of the vector variable `chars', and binds `i', `start' and
`end' around uses of `sregex--char-range-aux', which appends to the
variable `class'.

The codes from RSTART through REND are scanned in order.  Each maximal
run of consecutive marked entries is handed to `sregex--char-range-aux'
once the run ends (or once the scan is over), and every marked entry
that was visited is reset to nil.  RSTART is evaluated once; REND is
evaluated on every iteration."
  `(let ((i ,rstart)
         (start nil)
         (end nil))
     (while (<= i ,rend)
       (cond ((aref chars i)
              ;; Open a new run if none is pending, then extend it.
              (or start (setq start i))
              (setq end i)
              (aset chars i nil))
             (t
              ;; A gap: emit whatever run was pending and reset.
              (sregex--char-range-aux)
              (setq start nil
                    end nil)))
       (setq i (1+ i)))
     ;; Emit a run that reaches all the way to REND.
     (sregex--char-range-aux)))
|
||||
|
||||
(defun sregex--char-aux (complement args)
  "Return a regexp string matching the character set described by ARGS.
Each element of ARGS is a character, a string (all of its characters
are added), or a pair (MIN . MAX) of characters (the whole range is
added; the two ends may be given in either order).  Elements of any
other type are ignored.  When COMPLEMENT is non-nil the result is a
complemented bracket expression.

Only character codes 0..255 are supported, because the set is kept in a
256-element vector; larger codes signal an error from `aset'.

The characters that are special inside a bracket expression are placed
where they lose their special meaning: `]' first, `^' after any other
members and `-' last.  A non-complemented set consisting only of `^'
cannot be written as a bracket expression and is returned as a quoted
caret instead.

NOTE(review): when ARGS selects no character at all the result is the
string \"[]\" (or \"[^]\" with COMPLEMENT); callers should not rely on
that being a usable regexp."
  (let ((chars (make-vector 256 nil)))
    ;; Pass 1: mark every requested character code in CHARS.
    (while args
      (let ((arg (car args)))
        (cond ((integerp arg)
               (aset chars arg t))
              ((stringp arg)
               (let ((k 0)
                     (len (length arg)))
                 (while (< k len)
                   (aset chars (aref arg k) t)
                   (setq k (1+ k)))))
              ((consp arg)
               (let ((lo (min (car arg) (cdr arg)))
                     (hi (max (car arg) (cdr arg))))
                 (while (<= lo hi)
                   (aset chars lo t)
                   (setq lo (1+ lo)))))))
      (setq args (cdr args)))
    ;; Pass 2: build the body of the bracket expression in CLASS.  The
    ;; `sregex--char-range' macro reads CHARS and appends to CLASS.
    (let ((class (if (aref chars ?\]) "]" "")) ;`]' is literal only if first
          (caret (aref chars ?^))
          (dash (aref chars ?-)))
      ;; The special characters are emitted by hand, so take them out of
      ;; the map before the generic scans below can see them.
      (aset chars ?\] nil)
      (aset chars ?^ nil)
      (aset chars ?- nil)
      (if (aref chars ?\\)
          (progn
            ;; Backslash is an ordinary member inside brackets.
            (setq class (concat class "\\"))
            (aset chars ?\\ nil)))

      ;; Letters and digits first, collapsed into ranges where possible.
      (sregex--char-range ?A ?Z)
      (sregex--char-range ?a ?z)
      (sregex--char-range ?0 ?9)

      ;; Whatever is left in 32..127 is listed one character at a time.
      (let ((i 32))
        (while (< i 128)
          (if (aref chars i)
              (progn
                (setq class (concat class (char-to-string i)))
                (aset chars i nil)))
          (setq i (1+ i))))

      ;; Finally the codes below 32 and above 127.
      (sregex--char-range 0 31)
      (sregex--char-range 128 255)

      (cond ((and caret (not complement) (string= class ""))
             ;; Nothing precedes the caret, so it would be read as the
             ;; complement marker.  Put the dash in front of it when
             ;; there is one, otherwise match the caret as a quoted
             ;; character outside any bracket expression.
             (if dash "[-^]" "\\^"))
            (t
             ;; `-' goes last: anywhere else it could be taken as the
             ;; middle of a range (e.g. right after a leading `]').
             (concat "["
                     (if complement "^")
                     class
                     (if caret "^")
                     (if dash "-")
                     "]"))))))
|
||||
|
||||
(defun sregex--char (exps combine)
  "Build the value for the clause (char . EXPS).
The regexp string is produced by `sregex--char-aux' without
complementing.  The result is a unit with no groups.  COMBINE is
ignored."
  (let ((bracket (sregex--char-aux nil exps)))
    (sregex--make-value t nil (cons 'str bracket))))
|
||||
(defun sregex--not-char (exps combine)
  "Build the value for the clause (not-char . EXPS).
The regexp string is produced by `sregex--char-aux' with complementing
turned on.  The result is a unit with no groups.  COMBINE is ignored."
  (let ((bracket (sregex--char-aux t exps)))
    (sregex--make-value t nil (cons 'str bracket))))
|
||||
|
||||
(defun sregex--bol (combine)
  "Build the value for the symbol `bol': the regexp string \"^\".
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "^")))
|
||||
(defun sregex--eol (combine)
  "Build the value for the symbol `eol': the regexp string \"$\".
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "$")))
|
||||
|
||||
(defun sregex--wordchar (combine)
  "Build the value for the symbol `wordchar'.
The tree is a `str' node holding a backslash followed by a lowercase w.
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\w")))
|
||||
(defun sregex--not-wordchar (combine)
  "Build the value for the symbol `not-wordchar'.
The tree is a `str' node holding a backslash followed by an uppercase W.
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\W")))
|
||||
|
||||
(defun sregex--syntax (exps combine)
  "Build the value for the clause (syntax CODE).
CODE, the first element of EXPS, is a character; it is appended to a
backslash and a lowercase s.  The result is a unit with no groups.
COMBINE is ignored."
  (let ((code (car exps)))
    (sregex--make-value t nil (cons 'str (format "\\s%c" code)))))
|
||||
(defun sregex--not-syntax (exps combine)
  "Build the value for the clause (not-syntax CODE).
CODE, the first element of EXPS, is a character; it is appended to a
backslash and an uppercase S.  The result is a unit with no groups.
COMBINE is ignored."
  (let ((code (car exps)))
    (sregex--make-value t nil (cons 'str (format "\\S%c" code)))))
|
||||
|
||||
(defun sregex--bot (combine)
  "Build the value for the symbol `bot'.
The tree is a `str' node holding a backslash followed by a backquote.
The result is a unit with no groups.  COMBINE is ignored."
  (let ((anchor "\\`"))
    (sregex--make-value t nil (cons 'str anchor))))
|
||||
(defun sregex--eot (combine)
  "Build the value for the symbol `eot'.
The tree is a `str' node holding a backslash followed by an apostrophe.
The result is a unit with no groups.  COMBINE is ignored."
  (let ((anchor "\\'"))
    (sregex--make-value t nil (cons 'str anchor))))
|
||||
|
||||
(defun sregex--point (combine)
  "Build the value for the symbol `point'.
The tree is a `str' node holding a backslash followed by an equals sign.
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\=")))
|
||||
|
||||
(defun sregex--word-boundary (combine)
  "Build the value for the symbol `word-boundary'.
The tree is a `str' node holding a backslash followed by a lowercase b.
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\b")))
|
||||
(defun sregex--not-word-boundary (combine)
  "Build the value for the symbol `not-word-boundary'.
The tree is a `str' node holding a backslash followed by an uppercase B.
The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\B")))
|
||||
|
||||
(defun sregex--bow (combine)
  "Build the value for the symbol `bow'.
The tree is a `str' node holding a backslash followed by a less-than
sign.  The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\<")))
|
||||
(defun sregex--eow (combine)
  "Build the value for the symbol `eow'.
The tree is a `str' node holding a backslash followed by a greater-than
sign.  The result is a unit with no groups.  COMBINE is ignored."
  (sregex--make-value t nil (cons 'str "\\>")))
|
||||
|
||||
|
||||
;; Trapdoor for raw regexp strings -- usage discouraged.
(defun sregex--regex (exps combine)
  "Build the value for the clause (regex REGEX).
The first element of EXPS is used as the tree as is, without being
wrapped in a tagged node.  The result is not flagged as a unit and
records no groups.  COMBINE is ignored."
  (let ((raw (car exps)))
    (sregex--make-value nil nil raw)))
|
||||
|
||||
(provide 'sregex)
|
||||
|
||||
;;; sregex.el ends here
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue