Add textsec-domain-suspicious-p

* .gitignore: Ignore idna-mapping.el.

* admin/notes/unicode: Note idna-mapping file.
* admin/unidata/IdnaMappingTable.txt: New file.

* admin/unidata/Makefile.in (all): Generate idna-mapping.el.

* admin/unidata/unidata-gen.el (unidata-gen-idna-mapping): New
function to generate idna-mapping.el.

* lisp/international/textsec.el (textsec-domain-suspicious-p): New
function.
This commit is contained in:
Lars Ingebrigtsen 2022-01-18 11:53:01 +01:00
parent 702ce8dc3e
commit 9f25c41ad4
10 changed files with 15348 additions and 9 deletions

1
.gitignore vendored
View file

@ -221,6 +221,7 @@ lisp/international/emoji-zwj.el
lisp/international/emoji-labels.el
lisp/international/eucjp-ms.el
lisp/international/uni-*.el
lisp/international/idna-mapping.el
lisp/language/pinyin.el
# Documentation.

View file

@ -26,13 +26,15 @@ Emacs uses the following files from the Unicode Character Database
. BidiCharacterTest.txt
Emacs also uses the file emoji-test.txt which should be imported from
the Unicode's Public/emoji/ directory.
the Unicode's Public/emoji/ directory, and IdnaMappingTable.txt from
the Public/idna/ directory.
First, the first 14 files and emoji-test.txt need to be copied into
admin/unidata/, and the file https://www.unicode.org/copyright.html
should be copied over copyright.html in admin/unidata (some of them
might need trailing whitespace removed before they can be committed to
the Emacs repository).
First, the first 14 files, emoji-test.txt and IdnaMappingTable.txt
need to be copied into admin/unidata/, and the file
https://www.unicode.org/copyright.html should be copied over
copyright.html in admin/unidata (some of them might need trailing
whitespace removed before they can be committed to the Emacs
repository).
Then Emacs should be rebuilt for them to take effect. Rebuilding
Emacs updates several derived files elsewhere in the Emacs source

File diff suppressed because it is too large Load diff

View file

@ -45,7 +45,8 @@ unifiles = $(addprefix ${unidir}/,$(sort $(shell sed -n 's/^[ \t][ \t]*${lparen}
all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \
${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el \
${unidir}/uni-scripts.el ${unidir}/uni-confusable.el
${unidir}/uni-scripts.el ${unidir}/uni-confusable.el \
${unidir}/idna-mapping.el
## Specify .elc as an order-only prereq so as to not needlessly rebuild
## target just because the .elc is missing.
@ -95,6 +96,11 @@ ${unidir}/uni-confusable.el: ${srcdir}/unidata-gen.el \
$(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen.el -f unidata-gen-confusable $@
## Build idna-mapping.el from IdnaMappingTable.txt by calling
## unidata-gen-idna-mapping in unidata-gen.el; the target file name
## ($@) is passed as the remaining command-line argument.
${unidir}/idna-mapping.el: ${srcdir}/unidata-gen.el \
${srcdir}/IdnaMappingTable.txt
$(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen.el -f unidata-gen-idna-mapping $@
.PHONY: charscript.el
charscript.el: ${unidir}/charscript.el

View file

@ -61,6 +61,6 @@ PropertyValueAliases.txt
https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
2022-01-17
confusables.txt
https://www.unicode.org/Public/security/latest/confusables.txt
IdnaMappingTable.txt
https://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
2022-01-18

View file

@ -1602,6 +1602,46 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(insert ")")
(unidata-gen-charprop file (buffer-string)))))
(defun unidata-gen-idna-mapping (&optional file)
  "Generate the `idna-mapping-table' definition from IdnaMappingTable.txt.
The resulting text is handed to `unidata-gen-charprop' together with
FILE.  If FILE is nil, it is popped from `command-line-args-left'.

In the generated char-table, characters whose status is \"valid\" or
\"deviation\" are left unset, \"mapped\" characters get their
replacement string, statuses starting with \"disallowed\" become t,
and any other status is stored as a symbol of the same name."
  ;; When run from the Makefile, the output file is the next
  ;; command-line argument.
  (or file
      (setq file (pop command-line-args-left)))
  (let ((table (make-char-table nil)))
    (with-temp-buffer
      (unidata-gen--insert-file "IdnaMappingTable.txt")
      (while (re-search-forward "^\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^ ]+\\) +\\(?:; +\\([ 0-9A-F]+\\)\\)?"
                                nil t)
        (let* ((from (match-string 1))
               (to (match-string 2))
               (kind (match-string 3))
               (target (match-string 4))
               ;; Either a single code point or a (FROM . TO) range.
               (range (if to
                          (cons (string-to-number from 16)
                                (string-to-number to 16))
                        (string-to-number from 16))))
          (cond
           ;; Storing t instead of a `disallowed...' symbol makes
           ;; reading the generated file slightly faster.
           ((string-match-p "\\`disallowed" kind)
            (set-char-table-range table range t))
           ;; Valid and deviation characters keep the default nil.
           ((member kind '("valid" "deviation")))
           ;; The mapping is a space-separated list of hex code points.
           ((equal kind "mapped")
            (set-char-table-range
             table range
             (apply #'string
                    (mapcar (lambda (hex) (string-to-number hex 16))
                            (split-string (string-trim target))))))
           (t
            (set-char-table-range table range (intern kind)))))))
    (with-temp-buffer
      (insert "(defconst idna-mapping-table\n")
      ;; Make sure the whole table is printed, not an abbreviation.
      (let ((print-length nil))
        (prin1 table (current-buffer)))
      (insert ")")
      (unidata-gen-charprop file (buffer-string)))))
;;; unidata-gen.el ends here

View file

@ -26,6 +26,8 @@
(require 'cl-lib)
(require 'uni-confusable)
(require 'ucs-normalize)
(require 'idna-mapping)
(require 'puny)
(defvar textsec--char-scripts nil)
@ -222,6 +224,18 @@ STRING isn't a single script string."
(textsec-single-script-p string1)
(textsec-single-script-p string2)))
(defun textsec-domain-suspicious-p (domain)
  "Say whether DOMAIN looks suspicious.
Return nil if it isn't suspicious.  If it is, return a string
explaining the problem.

DOMAIN is considered suspicious if it contains a character whose
entry in `idna-mapping-table' is t, or if
`puny-highly-restrictive-domain-p' returns nil for it."
  (catch 'found
    ;; Report the first character that the IDNA table marks with t.
    (seq-do
     (lambda (char)
       (when (eq (elt idna-mapping-table char) t)
         (throw 'found (format "Disallowed character: `%s' (#x%x)"
                               (string char) char))))
     domain)
    (unless (puny-highly-restrictive-domain-p domain)
      ;; The message must be run through `format' so that DOMAIN is
      ;; substituted for the %s placeholder.
      (throw 'found (format "%s is not highly restrictive" domain)))
    nil))
(provide 'textsec)
;;; textsec.el ends here

View file

@ -110,4 +110,9 @@
(should-not (textsec-whole-script-confusable-p "paypal" "pаypаl"))
(should (textsec-whole-script-confusable-p "scope""ѕсоре")))
(ert-deftest test-suspiction-domain ()
  "Check `textsec-domain-suspicious-p' on plain and suspicious domains."
  ;; A plain ASCII domain must not be reported.
  (should-not (textsec-domain-suspicious-p "foo.org"))
  ;; A slash inside the domain must be reported.
  (should (textsec-domain-suspicious-p "foo/bar.org"))
  ;; An embedded LEFT-TO-RIGHT ISOLATE character must be reported.
  (should (textsec-domain-suspicious-p "f\N{LEFT-TO-RIGHT ISOLATE}oo.org")))
;;; textsec-tests.el ends here

File diff suppressed because it is too large Load diff

View file

@ -68,4 +68,10 @@
"xn--b.com-gra"))
(should (equal (puny-encode-string "Bä.com") "xn--b.com-gra")))
;;; TODO!
;; puny-resources/IdnaTestV2.txt contains many test cases that should
;; be implemented here.  However, the puny encoding does not yet fully
;; implement https://www.unicode.org/reports/tr46/#Conformance, so
;; those tests would currently fail.
;;; puny-tests.el ends here