Add textsec support for confusable characters
* admin/notes/unicode: Note the confusables.txt file. * admin/unidata/Makefile.in (${unidir}/uni-confusable.el): Generate the confusable file. * admin/unidata/README (https): Add confusables.txt. * admin/unidata/confusables.txt: New file. * admin/unidata/unidata-gen.el (unidata-gen-confusable): Parse the confusables.txt file. * lisp/international/textsec.el (textsec-ascii-confusable-p) (textsec-unconfuse-string): New functions.
This commit is contained in:
parent
65c9f57856
commit
19fefea1ca
7 changed files with 9702 additions and 2 deletions
|
@ -19,6 +19,7 @@ Emacs uses the following files from the Unicode Character Database
|
|||
. ScriptExtensions.txt
|
||||
. Scripts.txt
|
||||
. SpecialCasing.txt
|
||||
. confusables.txt
|
||||
. emoji-data.txt
|
||||
. emoji-zwj-sequences.txt
|
||||
. emoji-sequences.txt
|
||||
|
@ -27,7 +28,7 @@ Emacs uses the following files from the Unicode Character Database
|
|||
Emacs also uses the file emoji-test.txt which should be imported from
|
||||
the Unicode's Public/emoji/ directory.
|
||||
|
||||
First, the first 13 files and emoji-test.txt need to be copied into
|
||||
First, the first 14 files and emoji-test.txt need to be copied into
|
||||
admin/unidata/, and the file https://www.unicode.org/copyright.html
|
||||
should be copied over copyright.html in admin/unidata (some of them
|
||||
might need trailing whitespace removed before they can be committed to
|
||||
|
|
|
@ -45,7 +45,7 @@ unifiles = $(addprefix ${unidir}/,$(sort $(shell sed -n 's/^[ \t][ \t]*${lparen}
|
|||
|
||||
all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \
|
||||
${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el \
|
||||
${unidir}/uni-scripts.el
|
||||
${unidir}/uni-scripts.el ${unidir}/uni-confusable.el
|
||||
|
||||
## Specify .elc as an order-only prereq so as to not needlessly rebuild
|
||||
## target just because the .elc is missing.
|
||||
|
@ -90,6 +90,11 @@ ${unidir}/uni-scripts.el: ${srcdir}/unidata-gen.el \
|
|||
$(AM_V_GEN)${emacs} -L ${srcdir} \
|
||||
-l unidata-gen.el -f unidata-gen-scripts $@
|
||||
|
||||
## Rebuild uni-confusable.el whenever unidata-gen.el or confusables.txt
## changes.  The recipe loads unidata-gen.el and calls
## unidata-gen-confusable, which reads the target path ($@) from the
## remaining command-line arguments.
${unidir}/uni-confusable.el: ${srcdir}/unidata-gen.el \
|
||||
${srcdir}/confusables.txt
|
||||
$(AM_V_GEN)${emacs} -L ${srcdir} \
|
||||
-l unidata-gen.el -f unidata-gen-confusable $@
|
||||
|
||||
.PHONY: charscript.el
|
||||
charscript.el: ${unidir}/charscript.el
|
||||
|
||||
|
|
|
@ -60,3 +60,7 @@ https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt
|
|||
PropertyValueAliases.txt
|
||||
https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
|
||||
2022-01-17
|
||||
|
||||
confusables.txt
|
||||
https://www.unicode.org/Public/security/latest/confusables.txt
|
||||
2022-01-18
|
||||
|
|
9637
admin/unidata/confusables.txt
Normal file
9637
admin/unidata/confusables.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1576,6 +1576,32 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
|
|||
(expand-file-name (concat "../admin/unidata/" name)
|
||||
data-directory)))
|
||||
|
||||
(defun unidata-gen-confusable (&optional file)
  "Build the `uni-confusable-table' definition from confusables.txt.
Each matching line of confusables.txt gives a source code point and
a sequence of target code points, all in hexadecimal.  The table
maps the source character to a string of the target characters.
The resulting `defconst' form is handed to `unidata-gen-charprop'
together with FILE.
If FILE is nil, it is popped from `command-line-args-left', which is
how this function is invoked from the Makefile."
  (let ((output (or file (pop command-line-args-left)))
        (table (make-hash-table))
        ;; SOURCE ; TARGET [TARGET...] ; <rest of line>
        (entry-re (concat "^\\([[:xdigit:]]+\\)"
                          "[ \t]*;[ \t]*"
                          "\\([[:space:][:xdigit:]]+\\)"
                          "[ \t]*;")))
    ;; Pass 1: collect every mapping found in the data file.
    (with-temp-buffer
      (unidata-gen--insert-file "confusables.txt")
      (while (re-search-forward entry-re nil t)
        (let* ((source (string-to-number (match-string 1) 16))
               (hex-targets (split-string (string-trim (match-string 2))))
               (target-chars (mapcar (lambda (hex)
                                       (string-to-number hex 16))
                                     hex-targets)))
          (puthash source (apply #'string target-chars) table))))
    ;; Pass 2: print the table as a `defconst' form.
    (with-temp-buffer
      (insert "(defconst uni-confusable-table\n")
      ;; Make sure the printed table is never abbreviated.
      (let ((print-length nil))
        (prin1 table (current-buffer)))
      (insert ")")
      (unidata-gen-charprop output (buffer-string)))))
|
||||
|
||||
|
||||
|
||||
;;; unidata-gen.el ends here
|
||||
|
|
|
@ -24,6 +24,8 @@
|
|||
;;; Code:
|
||||
|
||||
(require 'cl-lib)
|
||||
(require 'uni-confusable)
|
||||
(require 'ucs-normalize)
|
||||
|
||||
(defvar textsec--char-scripts nil)
|
||||
|
||||
|
@ -172,6 +174,24 @@ Levels are (in decreasing order of restrictiveness) `ascii-only',
|
|||
string))))
|
||||
1))
|
||||
|
||||
(defun textsec-ascii-confusable-p (string)
  "Return non-nil if STRING is non-ASCII but could be mistaken for ASCII.
That is the case when the restriction level of STRING is not
`ascii-only', while the restriction level of its de-confused form
\(see `textsec-unconfuse-string') is."
  (unless (eq (textsec-restriction-level string) 'ascii-only)
    (let ((unconfused (textsec-unconfuse-string string)))
      (eq (textsec-restriction-level unconfused) 'ascii-only))))
|
||||
|
||||
(defun textsec-unconfuse-string (string)
  "Return a copy of STRING with confusable characters replaced.
STRING is first NFD-normalized.  Each character that has an entry
in `uni-confusable-table' is then replaced by that entry, and the
result is NFD-normalized once more.  The algorithm is described in:

  https://www.unicode.org/reports/tr39/#Confusable_Detection"
  (let ((decomposed (ucs-normalize-NFD-string string)))
    (ucs-normalize-NFD-string
     (mapconcat (lambda (char)
                  ;; Characters without a table entry are kept as-is.
                  (or (gethash char uni-confusable-table)
                      (string char)))
                decomposed
                ""))))
|
||||
|
||||
(provide 'textsec)
|
||||
|
||||
;;; textsec.el ends here
|
||||
|
|
|
@ -86,4 +86,11 @@
|
|||
(should-not (textsec-mixed-numbers-p "8foo8"))
|
||||
(should (textsec-mixed-numbers-p "8foo৪")))
|
||||
|
||||
(ert-deftest test-confusable ()
  "Check confusable replacement and detection of ASCII look-alikes."
  ;; The non-ASCII inputs are spelled with \u escapes so that they
  ;; cannot be silently folded to ASCII by compatibility normalization.
  ;;
  ;; U+01C9 (LATIN SMALL LETTER LJ) is a single character that is
  ;; confusable with the two ASCII characters "lj".
  (should (equal (textsec-unconfuse-string "\u01C9eto") "ljeto"))
  (should (textsec-ascii-confusable-p "\u01C9eto"))
  ;; An all-ASCII string is not itself flagged as confusable.
  (should-not (textsec-ascii-confusable-p "ljeto"))
  ;; U+FF5E (FULLWIDTH TILDE) maps to U+301C (WAVE DASH), which is not
  ;; ASCII, so the string is not confusable with ASCII.
  (should (equal (textsec-unconfuse-string "\uFF5E") "\u301C"))
  (should-not (textsec-ascii-confusable-p "\uFF5E")))
|
||||
|
||||
;;; textsec-tests.el ends here
|
||||
|
|
Loading…
Add table
Reference in a new issue