Add textsec-domain-suspicious-p

* .gitignore: Ignore idna-mapping.el.

* admin/notes/unicode: Note idna-mapping file.
* admin/unidata/IdnaMappingTable.txt: New file.

* admin/unidata/Makefile.in (all): Generate idna-mapping.el.

* admin/unidata/unidata-gen.el (unidata-gen-idna-mapping): New
function to generate idna-mapping.el.

* lisp/international/textsec.el (textsec-domain-suspicious-p): New
function.
This commit is contained in:
Lars Ingebrigtsen 2022-01-18 11:53:01 +01:00
parent 702ce8dc3e
commit 9f25c41ad4
10 changed files with 15348 additions and 9 deletions

1
.gitignore vendored
View file

@ -221,6 +221,7 @@ lisp/international/emoji-zwj.el
lisp/international/emoji-labels.el lisp/international/emoji-labels.el
lisp/international/eucjp-ms.el lisp/international/eucjp-ms.el
lisp/international/uni-*.el lisp/international/uni-*.el
lisp/international/idna-mapping.el
lisp/language/pinyin.el lisp/language/pinyin.el
# Documentation. # Documentation.

View file

@ -26,13 +26,15 @@ Emacs uses the following files from the Unicode Character Database
. BidiCharacterTest.txt . BidiCharacterTest.txt
Emacs also uses the file emoji-test.txt which should be imported from Emacs also uses the file emoji-test.txt which should be imported from
the Unicode's Public/emoji/ directory. the Unicode's Public/emoji/ directory, and IdnaMappingTable.txt from
the Public/idna/ directory.
First, the first 14 files and emoji-test.txt need to be copied into First, the first 14 files, emoji-test.txt and IdnaMappingTable.txt
admin/unidata/, and the file https://www.unicode.org/copyright.html need to be copied into admin/unidata/, and the file
should be copied over copyright.html in admin/unidata (some of them https://www.unicode.org/copyright.html should be copied over
might need trailing whitespace removed before they can be committed to copyright.html in admin/unidata (some of them might need trailing
the Emacs repository). whitespace removed before they can be committed to the Emacs
repository).
Then Emacs should be rebuilt for them to take effect. Rebuilding Then Emacs should be rebuilt for them to take effect. Rebuilding
Emacs updates several derived files elsewhere in the Emacs source Emacs updates several derived files elsewhere in the Emacs source

File diff suppressed because it is too large Load diff

View file

@ -45,7 +45,8 @@ unifiles = $(addprefix ${unidir}/,$(sort $(shell sed -n 's/^[ \t][ \t]*${lparen}
all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \ all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \
${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el \ ${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el \
${unidir}/uni-scripts.el ${unidir}/uni-confusable.el ${unidir}/uni-scripts.el ${unidir}/uni-confusable.el \
${unidir}/idna-mapping.el
## Specify .elc as an order-only prereq so as to not needlessly rebuild ## Specify .elc as an order-only prereq so as to not needlessly rebuild
## target just because the .elc is missing. ## target just because the .elc is missing.
@ -95,6 +96,11 @@ ${unidir}/uni-confusable.el: ${srcdir}/unidata-gen.el \
$(AM_V_GEN)${emacs} -L ${srcdir} \ $(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen.el -f unidata-gen-confusable $@ -l unidata-gen.el -f unidata-gen-confusable $@
## Build idna-mapping.el from IdnaMappingTable.txt by running
## `unidata-gen-idna-mapping' (defined in unidata-gen.el) with the
## target file name as its command-line argument.
${unidir}/idna-mapping.el: ${srcdir}/unidata-gen.el \
${srcdir}/IdnaMappingTable.txt
$(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen.el -f unidata-gen-idna-mapping $@
.PHONY: charscript.el .PHONY: charscript.el
charscript.el: ${unidir}/charscript.el charscript.el: ${unidir}/charscript.el

View file

@ -61,6 +61,6 @@ PropertyValueAliases.txt
https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
2022-01-17 2022-01-17
confusables.txt IdnaMappingTable.txt
https://www.unicode.org/Public/security/latest/confusables.txt https://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt
2022-01-18 2022-01-18

View file

@ -1602,6 +1602,46 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(insert ")") (insert ")")
(unidata-gen-charprop file (buffer-string))))) (unidata-gen-charprop file (buffer-string)))))
(defun unidata-gen-idna-mapping (&optional file)
  "Build `idna-mapping-table' from IdnaMappingTable.txt for FILE.
If FILE is nil, it is taken from `command-line-args-left' (this is
the case when running from the Makefile).

Every matching line of the data file contributes one entry to a
char-table, keyed by a single code point or a START..END range:
  - a status starting with \"disallowed\" is recorded as t;
  - the statuses \"valid\" and \"deviation\" are not recorded;
  - \"mapped\" is recorded as the string made from the listed
    hexadecimal code points;
  - any other status is recorded as the symbol with that name.

The table is printed inside a `defconst' form and the resulting
text is passed, along with FILE, to `unidata-gen-charprop'."
  (let ((output (or file (pop command-line-args-left)))
        (table (make-char-table nil))
        (entry-re "^\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^ ]+\\) +\\(?:; +\\([ 0-9A-F]+\\)\\)?"))
    ;; Pass 1: parse the data file into TABLE.
    (with-temp-buffer
      (unidata-gen--insert-file "IdnaMappingTable.txt")
      (while (re-search-forward entry-re nil t)
        (let* ((first (string-to-number (match-string 1) 16))
               (last (match-string 2))
               (status (match-string 3))
               (mapped (match-string 4))
               ;; Either a lone code point or a (FROM . TO) range.
               (range (if last
                          (cons first (string-to-number last 16))
                        first)))
          (cond
           ;; Using t instead of a `disallowed...' symbol makes
           ;; reading the generated file slightly faster.
           ((string-match-p "\\`disallowed" status)
            (set-char-table-range table range t))
           ;; These statuses are left out of the table.
           ((member status '("valid" "deviation")))
           ((equal status "mapped")
            (set-char-table-range
             table range
             (apply #'string
                    (mapcar (lambda (hex)
                              (string-to-number hex 16))
                            (split-string (string-trim mapped))))))
           (t
            (set-char-table-range table range (intern status)))))))
    ;; Pass 2: print TABLE as a `defconst' and hand it over for writing.
    (with-temp-buffer
      (insert "(defconst idna-mapping-table\n")
      ;; Make sure the printed table is never truncated.
      (let ((print-length nil))
        (prin1 table (current-buffer)))
      (insert ")")
      (unidata-gen-charprop output (buffer-string)))))
;;; unidata-gen.el ends here ;;; unidata-gen.el ends here

View file

@ -26,6 +26,8 @@
(require 'cl-lib) (require 'cl-lib)
(require 'uni-confusable) (require 'uni-confusable)
(require 'ucs-normalize) (require 'ucs-normalize)
(require 'idna-mapping)
(require 'puny)
(defvar textsec--char-scripts nil) (defvar textsec--char-scripts nil)
@ -222,6 +224,18 @@ STRING isn't a single script string."
(textsec-single-script-p string1) (textsec-single-script-p string1)
(textsec-single-script-p string2))) (textsec-single-script-p string2)))
(defun textsec-domain-suspicious-p (domain)
  "Say whether DOMAIN looks suspicious.
Return nil if it isn't suspicious.  If it is, return a string
explaining the potential problem.

DOMAIN is considered suspicious if it contains a character whose
entry in `idna-mapping-table' is t, or if
`puny-highly-restrictive-domain-p' returns nil for it."
  (catch 'found
    ;; First look for any character marked t in the IDNA mapping table.
    (seq-do
     (lambda (char)
       (when (eq (elt idna-mapping-table char) t)
         (throw 'found (format "Disallowed character: `%s' (#x%x)"
                               (string char) char))))
     domain)
    (unless (puny-highly-restrictive-domain-p domain)
      ;; The message is a format template; run it through `format' so
      ;; the caller sees the domain name rather than a literal "%s".
      (throw 'found (format "%s is not highly restrictive" domain)))
    nil))
(provide 'textsec) (provide 'textsec)
;;; textsec.el ends here ;;; textsec.el ends here

View file

@ -110,4 +110,9 @@
(should-not (textsec-whole-script-confusable-p "paypal" "pаypаl")) (should-not (textsec-whole-script-confusable-p "paypal" "pаypаl"))
(should (textsec-whole-script-confusable-p "scope""ѕсоре"))) (should (textsec-whole-script-confusable-p "scope""ѕсоре")))
(ert-deftest test-suspiction-domain ()
  "Check `textsec-domain-suspicious-p' on one plain and two odd domains."
  ;; A plain domain name must not be flagged.
  (should-not (textsec-domain-suspicious-p "foo.org"))
  ;; A domain containing a slash, and one containing a bidi isolate
  ;; control character, must both be flagged.
  (dolist (domain '("foo/bar.org"
                    "f\N{LEFT-TO-RIGHT ISOLATE}oo.org"))
    (should (textsec-domain-suspicious-p domain))))
;;; textsec-tests.el ends here ;;; textsec-tests.el ends here

File diff suppressed because it is too large Load diff

View file

@ -68,4 +68,10 @@
"xn--b.com-gra")) "xn--b.com-gra"))
(should (equal (puny-encode-string "Bä.com") "xn--b.com-gra"))) (should (equal (puny-encode-string "Bä.com") "xn--b.com-gra")))
;;; TODO!
;; puny-resources/IdnaTestV2.txt contains a large set of test cases
;; that should be turned into tests here.  However, the puny encoding
;; does not yet fully implement
;; https://www.unicode.org/reports/tr46/#Conformance, so those tests
;; would currently fail.
;;; puny-tests.el ends here ;;; puny-tests.el ends here