Add textsec support for confusable characters

* admin/notes/unicode: Note the confusables.txt file.
* admin/unidata/Makefile.in (${unidir}/uni-confusable.el):
Generate the confusable file.

* admin/unidata/README (https): Add confusables.txt.

* admin/unidata/confusables.txt: New file.

* admin/unidata/unidata-gen.el (unidata-gen-confusable): Parse the
confusables.txt file.

* lisp/international/textsec.el (textsec-ascii-confusable-p)
(textsec-unconfuse-string): New functions.
This commit is contained in:
Lars Ingebrigtsen 2022-01-18 09:57:43 +01:00
parent 65c9f57856
commit 19fefea1ca
7 changed files with 9702 additions and 2 deletions

View file

@ -19,6 +19,7 @@ Emacs uses the following files from the Unicode Character Database
. ScriptExtensions.txt
. Scripts.txt
. SpecialCasing.txt
. confusables.txt
. emoji-data.txt
. emoji-zwj-sequences.txt
. emoji-sequences.txt
@ -27,7 +28,7 @@ Emacs uses the following files from the Unicode Character Database
Emacs also uses the file emoji-test.txt which should be imported from
the Unicode's Public/emoji/ directory.
First, the first 13 files and emoji-test.txt need to be copied into
First, the first 14 files and emoji-test.txt need to be copied into
admin/unidata/, and the file https://www.unicode.org/copyright.html
should be copied over copyright.html in admin/unidata (some of them
might need trailing whitespace removed before they can be committed to

View file

@ -45,7 +45,7 @@ unifiles = $(addprefix ${unidir}/,$(sort $(shell sed -n 's/^[ \t][ \t]*${lparen}
all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \
${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el \
${unidir}/uni-scripts.el
${unidir}/uni-scripts.el ${unidir}/uni-confusable.el
## Specify .elc as an order-only prereq so as to not needlessly rebuild
## target just because the .elc is missing.
@ -90,6 +90,11 @@ ${unidir}/uni-scripts.el: ${srcdir}/unidata-gen.el \
$(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen.el -f unidata-gen-scripts $@
## Build uni-confusable.el from confusables.txt by loading unidata-gen.el
## and calling unidata-gen-confusable with the target file as argument.
${unidir}/uni-confusable.el: ${srcdir}/unidata-gen.el \
${srcdir}/confusables.txt
$(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen.el -f unidata-gen-confusable $@
.PHONY: charscript.el
charscript.el: ${unidir}/charscript.el

View file

@ -60,3 +60,7 @@ https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt
PropertyValueAliases.txt
https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
2022-01-17
confusables.txt
https://www.unicode.org/Public/security/latest/confusables.txt
2022-01-18

File diff suppressed because it is too large Load diff

View file

@ -1576,6 +1576,32 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(expand-file-name (concat "../admin/unidata/" name)
data-directory)))
(defun unidata-gen-confusable (&optional file)
  "Generate the `uni-confusable-table' definition from confusables.txt.
Every line of confusables.txt that starts with a hexadecimal code
point, followed by a semicolon, a whitespace-separated list of
hexadecimal code points and another semicolon, becomes one hash
table entry: the key is the first character, the value is the string
made of the listed characters.
The printed table is wrapped in a `defconst' form and passed, along
with FILE, to `unidata-gen-charprop'.
If FILE is nil, it is popped from `command-line-args-left'."
  ;; When run from the Makefile, the output file is the next
  ;; command-line argument.
  (let ((output (or file (pop command-line-args-left)))
        (entry-re (concat "^\\([[:xdigit:]]+\\)"
                          "[ \t]*;[ \t]*"
                          "\\([[:space:][:xdigit:]]+\\)"
                          "[ \t]*;"))
        (table (make-hash-table)))
    ;; Collect the source -> target mappings.
    (with-temp-buffer
      (unidata-gen--insert-file "confusables.txt")
      (while (re-search-forward entry-re nil t)
        ;; Read both match groups before doing anything else with
        ;; the match data.
        (let* ((source (match-string 1))
               (targets (string-trim (match-string 2)))
               (codes (mapcar (lambda (hex)
                                (string-to-number hex 16))
                              (split-string targets))))
          (puthash (string-to-number source 16)
                   (apply #'string codes)
                   table))))
    ;; Emit the table as a `defconst' form.
    (with-temp-buffer
      (insert "(defconst uni-confusable-table\n")
      ;; Make sure the table is never abbreviated when printed.
      (let ((print-length nil))
        (prin1 table (current-buffer)))
      (insert ")")
      (unidata-gen-charprop output (buffer-string)))))
;;; unidata-gen.el ends here

View file

@ -24,6 +24,8 @@
;;; Code:
(require 'cl-lib)
(require 'uni-confusable)
(require 'ucs-normalize)
(defvar textsec--char-scripts nil)
@ -172,6 +174,24 @@ Levels are (in decreasing order of restrictiveness) `ascii-only',
string))))
1))
(defun textsec-ascii-confusable-p (string)
  "Return non-nil if STRING isn't ASCII, but is confusable with ASCII.
This holds when the restriction level of STRING is not `ascii-only'
while the restriction level of its de-confused form (as computed by
`textsec-unconfuse-string') is `ascii-only'."
  (unless (eq (textsec-restriction-level string) 'ascii-only)
    (let ((unconfused (textsec-unconfuse-string string)))
      (eq (textsec-restriction-level unconfused) 'ascii-only))))
(defun textsec-unconfuse-string (string)
  "Return a de-confused version of STRING.
STRING is first NFD-normalized; then every character that has an
entry in `uni-confusable-table' is replaced by that entry, all other
characters are kept, and the result is NFD-normalized once more.
This algorithm is described in:

  https://www.unicode.org/reports/tr39/#Confusable_Detection"
  (let* ((decomposed (ucs-normalize-NFD-string string))
         (pieces (mapcar (lambda (ch)
                           (or (gethash ch uni-confusable-table)
                               (string ch)))
                         decomposed)))
    (ucs-normalize-NFD-string (apply #'concat pieces))))
(provide 'textsec)
;;; textsec.el ends here

View file

@ -86,4 +86,11 @@
(should-not (textsec-mixed-numbers-p "8foo8"))
(should (textsec-mixed-numbers-p "8foo")))
(ert-deftest test-confusable ()
  "Check `textsec-unconfuse-string' and `textsec-ascii-confusable-p'."
  ;; The non-ASCII inputs are written as \\u escapes so that the test
  ;; keeps its meaning even if a tool folds or strips non-ASCII text.
  ;;
  ;; U+01C9 (LATIN SMALL LETTER LJ) is one non-ASCII character that is
  ;; confusable with the two ASCII letters "lj".
  (should (equal (textsec-unconfuse-string "\u01C9eto") "ljeto"))
  (should (textsec-ascii-confusable-p "\u01C9eto"))
  ;; A string that is already ASCII is not "confusable with ASCII".
  (should-not (textsec-ascii-confusable-p "ljeto"))
  ;; U+FF5E (FULLWIDTH TILDE) is confusable with U+301C (WAVE DASH),
  ;; which is not ASCII, so the string is not ASCII-confusable.
  (should (equal (textsec-unconfuse-string "\uFF5E") "\u301C"))
  (should-not (textsec-ascii-confusable-p "\uFF5E")))
;;; textsec-tests.el ends here