2015-06-30 19:45:56 -07:00
|
|
|
|
;;; character-fold.el --- match unicode to similar ASCII -*- lexical-binding: t; -*-
|
2015-06-24 20:01:10 +01:00
|
|
|
|
|
|
|
|
|
;; Copyright (C) 2015 Free Software Foundation, Inc.
|
|
|
|
|
|
|
|
|
|
;; Maintainer: emacs-devel@gnu.org
|
|
|
|
|
;; Keywords: matching
|
|
|
|
|
|
|
|
|
|
;; This file is part of GNU Emacs.
|
|
|
|
|
|
|
|
|
|
;; GNU Emacs is free software: you can redistribute it and/or modify
|
|
|
|
|
;; it under the terms of the GNU General Public License as published by
|
|
|
|
|
;; the Free Software Foundation, either version 3 of the License, or
|
|
|
|
|
;; (at your option) any later version.
|
|
|
|
|
|
|
|
|
|
;; GNU Emacs is distributed in the hope that it will be useful,
|
|
|
|
|
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
;; GNU General Public License for more details.
|
|
|
|
|
|
|
|
|
|
;; You should have received a copy of the GNU General Public License
|
|
|
|
|
;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
|
|
;;; Code:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
(defconst character-fold-table
  (eval-when-compile
    (let ((equiv (make-char-table 'character-fold-table))
          (table (unicode-property-table-internal 'decomposition)))
      ;; Ensure the table is populated: for every entry that
      ;; `map-char-table' reports as a range, call the function stored
      ;; in extra slot 1 of the decomposition table on the first
      ;; character of that range.
      (let ((func (char-table-extra-slot table 1)))
        (map-char-table (lambda (char v)
                          (when (consp char)
                            (funcall func (car char) v table)))
                        table))

      ;; Compile a list of all complex characters that each simple
      ;; character should match.  At this stage every entry of EQUIV
      ;; is a list of strings; the lists are turned into regexps at
      ;; the end.
      (map-char-table
       (lambda (char decomp)
         (when (consp decomp)
           (if (symbolp (car decomp))
               ;; Discard a possible formatting tag.
               (setq decomp (cdr decomp))
             ;; If there's no formatting tag, ensure that char matches
             ;; its decomp exactly.  This is because we want a
             ;; precomposed character such as U+00E4 to match its
             ;; decomposed spelling (a followed by U+0308), but we
             ;; don't want superscript one (U+00B9) to match "1".
             (aset equiv char
                   (cons (apply #'string decomp)
                         (aref equiv char))))
           ;; Finally, figure out whether char has a simpler
           ;; equivalent (char-aux).  If so, ensure that char-aux
           ;; matches char and maybe its decomp too.

           ;; Skip trivial cases like ?a decomposing to (?a).
           (unless (and (eq char (car decomp))
                        (not (cdr decomp)))
             (let ((dec-aux decomp)
                   ;; Non-nil while the whole decomposition string is
                   ;; still a candidate for being added to char-aux.
                   (fold-decomp t)
                   char-aux found)
               (while (and dec-aux (not found))
                 (setq char-aux (pop dec-aux))
                 ;; Is char-aux a number or letter, per unicode standard?
                 (setq found (memq (get-char-code-property char-aux 'general-category)
                                   '(Lu Ll Lt Lm Lo Nd Nl No))))
               (if found
                   ;; Check if the decomp has more than one letter,
                   ;; because then we don't want the first letter to
                   ;; match the decomposition.  This is because we
                   ;; want "f" to match the single ligature character
                   ;; U+FB00 but not the two-letter string "ff".
                   (dolist (char-aux dec-aux)
                     (when (and fold-decomp
                                (memq (get-char-code-property char-aux 'general-category)
                                      '(Lu Ll Lt Lm Lo Nd Nl No)))
                       (setq fold-decomp nil)))
                 ;; If there's no number or letter on the
                 ;; decomp, take the first character in it.
                 ;; NOTE(review): only `found' is updated here;
                 ;; `char-aux' still holds the last element popped by
                 ;; the loop above, and `char-aux' is what is used as
                 ;; the key below -- confirm that this is intended.
                 (setq found (car-safe decomp)))
               ;; Finally, we only fold multi-char decomp if at
               ;; least one of the chars is non-spacing (combining).
               (when fold-decomp
                 (setq fold-decomp nil)
                 (dolist (char-aux decomp)
                   (when (and (not fold-decomp)
                              (> (get-char-code-property char-aux 'canonical-combining-class) 0))
                     (setq fold-decomp t))))
               ;; Add char to the list of characters that char-aux can
               ;; represent.  Also possibly add its decomp, so we can
               ;; match multi-char representations like (format "a%c" 769)
               (when (and found (not (eq char char-aux)))
                 (let ((chars (cons (char-to-string char) (aref equiv char-aux))))
                   (aset equiv char-aux
                         (if fold-decomp
                             (cons (apply #'string decomp) chars)
                           chars))))))))
       table)

      ;; Add some manual entries.  The fullwidth quotation mark
      ;; (U+FF02) and the invisible tag character (U+E0022) are
      ;; written as escapes so that they cannot be silently turned
      ;; into an ASCII quote or an empty string when the file is
      ;; re-encoded or normalized.
      (dolist (it '((?\" "\uFF02" "“" "”" "”" "„" "⹂" "〞" "‟" "‟" "❞" "❝" "❠" "“" "„" "〝" "〟" "🙷" "🙶" "🙸" "«" "»")
                    (?' "❟" "❛" "❜" "‘" "’" "‚" "‛" "‚" "\U000E0022" "❮" "❯" "‹" "›")
                    (?` "❛" "‘" "‛" "\U000E0022" "❮" "‹")))
        (let ((idx (car it))
              (chars (cdr it)))
          (aset equiv idx (append chars (aref equiv idx)))))

      ;; Convert the lists of characters we compiled into regexps.
      ;; NOTE(review): `char-to-string' is applied to CHAR before the
      ;; `consp' check, so the range branch would signal an error if
      ;; it were ever reached -- confirm whether ranges can occur here.
      (map-char-table
       (lambda (char dec-list)
         (let ((re (regexp-opt (cons (char-to-string char) dec-list))))
           (if (consp char)
               (set-char-table-range equiv char re)
             (aset equiv char re))))
       equiv)
      equiv))
  "Used for folding characters of the same group during search.
This is a char-table.  The entry for a character is either nil or
a regexp (a string) that matches the character itself together
with the characters and character sequences collected for it from
the Unicode decomposition data and a few manual entries for the
quote characters.")
|
|
|
|
|
|
2015-10-28 15:50:17 +00:00
|
|
|
|
(defun character-fold--make-space-string (n)
  "Return a regexp string that matches a run of N spaces.
The result is a shy group with two alternatives: N literal space
characters, or N copies of the `character-fold-table' entry for
the space character (a plain space is used when there is no such
entry)."
  (let* ((literal-run (make-string n ?\s))
         (one-folded (or (aref character-fold-table ?\s) " "))
         (folded-run (mapconcat #'identity (make-list n one-folded) "")))
    ;; The literal alternative comes first; the folded one follows.
    (concat "\\(?:" literal-run "\\|" folded-run "\\)")))
|
|
|
|
|
|
2015-06-24 20:01:10 +01:00
|
|
|
|
;;;###autoload
|
2015-10-25 01:43:23 +01:00
|
|
|
|
(defun character-fold-to-regexp (string &optional _lax)
  "Return a regexp matching anything that character-folds into STRING.
Each character of STRING that has an entry in
`character-fold-table' is replaced with that entry (which is a
regexp); every other character is passed through `regexp-quote'.
Runs of consecutive spaces are converted as a unit by
`character-fold--make-space-string'.  The optional argument _LAX
is ignored."
  ;; When the user types a space, we want to match the table entry
  ;; for ?\s, which is generally a regexp like "[ ...]".  However,
  ;; the `search-spaces-regexp' variable doesn't "see" spaces inside
  ;; these regexp constructs, so a literal space has to be exposed
  ;; as an explicit alternative.  Furthermore, the lax search engine
  ;; acts on a bunch of spaces, not on individual spaces, so
  ;; sequential spaces are counted and emitted as a single group.
  (let ((pieces '())      ; regexp fragments, most recent first
        (pending 0))      ; length of the current run of spaces
    (dotimes (pos (length string))
      (let ((ch (aref string pos)))
        (if (eq ch ?\s)
            (setq pending (1+ pending))
          ;; A non-space character ends the current run of spaces.
          (unless (zerop pending)
            (push (character-fold--make-space-string pending) pieces)
            (setq pending 0))
          (push (or (aref character-fold-table ch)
                    (regexp-quote (char-to-string ch)))
                pieces))))
    ;; Flush a run of spaces that reaches the end of STRING.
    (unless (zerop pending)
      (push (character-fold--make-space-string pending) pieces))
    (mapconcat #'identity (nreverse pieces) "")))
|
2015-10-25 01:43:23 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
;;; Commands provided for completeness.
|
|
|
|
|
(defun character-fold-search-forward (string &optional bound noerror count)
  "Search forward for text that character-folds into STRING.
STRING is first turned into a regexp by `character-fold-to-regexp';
that regexp is then handed to `re-search-forward' together with
BOUND, NOERROR and COUNT, and the value of that call is returned."
  (interactive "sSearch: ")
  (let ((folded-regexp (character-fold-to-regexp string)))
    (re-search-forward folded-regexp bound noerror count)))
|
|
|
|
|
|
|
|
|
|
(defun character-fold-search-backward (string &optional bound noerror count)
  "Search backward for text that character-folds into STRING.
STRING is first turned into a regexp by `character-fold-to-regexp';
that regexp is then handed to `re-search-backward' together with
BOUND, NOERROR and COUNT, and the value of that call is returned."
  (interactive "sSearch: ")
  (let ((folded-regexp (character-fold-to-regexp string)))
    (re-search-backward folded-regexp bound noerror count)))
|
2015-06-24 20:01:10 +01:00
|
|
|
|
|
2015-10-30 14:49:52 +01:00
|
|
|
|
(provide 'character-fold)
|
|
|
|
|
|
2015-06-24 21:25:01 -04:00
|
|
|
|
;;; character-fold.el ends here
|