;;; textsec.el --- Functions for handling homoglyphs and the like -*- lexical-binding: t; -*- ;; Copyright (C) 2022 Free Software Foundation, Inc. ;; This file is part of GNU Emacs. ;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs. If not, see . ;;; Commentary: ;; ;;; Code: (require 'cl-lib) (require 'uni-confusable) (require 'ucs-normalize) (require 'idna-mapping) (require 'puny) (require 'mail-parse) (defvar textsec--char-scripts nil) (eval-and-compile (defun textsec--create-script-table (data) "Create the textsec--char-scripts char table." (setq textsec--char-scripts (make-char-table nil)) (dolist (scripts data) (dolist (range (cadr scripts)) (set-char-table-range textsec--char-scripts range (car scripts))))) (require 'uni-scripts)) (defun textsec-scripts (string) "Return a list of Unicode scripts used in STRING. The scripts returned by this function use the Unicode Script property as defined by the Unicode Standard Annex 24 (UAX#24)." (seq-map (lambda (char) (elt textsec--char-scripts char)) string)) (defun textsec-single-script-p (string) "Return non-nil if STRING is all in a single Unicode script. Note that the concept of \"single script\" used by this function isn't obvious -- some mixtures of scripts count as a \"single script\". See https://www.unicode.org/reports/tr39/#Mixed_Script_Detection for details. The Unicode scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." (let ((scripts (mapcar (lambda (s) (append s ;; Some scripts used in East Asia are ;; commonly used across borders, so we add ;; those. (mapcan (lambda (script) (copy-sequence (textsec--augment-script script))) s))) (textsec-scripts string)))) (catch 'empty (cl-loop for s1 in scripts do (cl-loop for s2 in scripts ;; Common/inherited chars can be used in ;; text with all scripts. when (and (not (memq 'common s1)) (not (memq 'common s2)) (not (memq 'inherited s1)) (not (memq 'inherited s2)) (not (seq-intersection s1 s2))) do (throw 'empty nil))) t))) (defun textsec--augment-script (script) (cond ((eq script 'han) '(hangul japan korea)) ((or (eq script 'hiragana) (eq script 'katakana)) '(japan)) ((or (eq script 'hangul) (eq script 'bopomofo)) '(korea)))) (defun textsec-covering-scripts (string) "Return a minimal list of scripts used in STRING. Note that a string may have several different minimal cover sets. The scripts are as defined by the Unicode Standard Annex 24 (UAX#24)." (let* ((scripts (textsec-scripts string)) (set (car scripts))) (dolist (s scripts) (setq set (seq-union set (seq-difference s set)))) (sort (delq 'common (delq 'inherited set)) #'string<))) (defun textsec-restriction-level (string) "Say what restriction level STRING qualifies for. Levels are (in decreasing order of restrictiveness) `ascii-only', `single-script', `highly-restrictive', `moderately-restrictive', `minimally-restrictive' and `unrestricted'." (let ((scripts (textsec-covering-scripts string))) (cond ((string-match "\\`[[:ascii:]]+\\'" string) 'ascii-only) ((textsec-single-script-p string) 'single-script) ((or (null (seq-difference scripts '(latin han hiragana katakana))) (null (seq-difference scripts '(latin han bopomofo))) (null (seq-difference scripts '(latin han hangul)))) 'highly-restrictive) ((and (= (length scripts) 2) (memq 'latin scripts) ;; This list comes from ;; https://www.unicode.org/reports/tr31/#Table_Recommended_Scripts ;; (but is without latin, cyrillic and greek). (seq-intersection scripts '(arabic armenian bengali bopomofo devanagari ethiopic georgian gujarati gurmukhi hangul han hebrew hiragana katakana kannada khmer lao malayalam myanmar oriya sinhala tamil telugu thaana thai tibetan))) ;; The string is covered by Latin and any one other Recommended ;; script, except Cyrillic, Greek. 'moderately-retrictive) ;; Fixme `minimally-restrictive' -- needs well-formedness criteria ;; and Identifier Profile. (t 'unrestricted)))) (defun textsec-mixed-numbers-p (string) "Return non-nil if there are numbers from different decimal systems in STRING." (> (length (seq-uniq (mapcar (lambda (char) (get-char-code-property char 'numeric-value)) (seq-filter (lambda (char) ;; We're selecting the characters that ;; have a numeric property. (eq (get-char-code-property char 'general-category) 'Nd)) string)))) 1)) (defun textsec-ascii-confusable-p (string) "Return non-nil if STRING isn't ASCII, but is confusable with ASCII." (and (not (eq (textsec-restriction-level string) 'ascii-only)) (eq (textsec-restriction-level (textsec-unconfuse-string string)) 'ascii-only))) (defun textsec-unconfuse-string (string) "Return a de-confused version of STRING. This algorithm is described in: https://www.unicode.org/reports/tr39/#Confusable_Detection" (ucs-normalize-NFD-string (apply #'concat (seq-map (lambda (char) (or (gethash char uni-confusable-table) (string char))) (ucs-normalize-NFD-string string))))) (defun textsec-resolved-script-set (string) "Return the resolved script set for STRING. This is the minimal covering script set for STRING, but is nil is STRING isn't a single script string." (and (textsec-single-script-p string) (textsec-covering-scripts string))) (defun textsec-single-script-confusable-p (string1 string2) "Say whether STRING1 and STRING2 are single script confusables." (and (equal (textsec-unconfuse-string string1) (textsec-unconfuse-string string2)) ;; And they have to have at least one resolved script in ;; common. (seq-intersection (textsec-resolved-script-set string1) (textsec-resolved-script-set string2)))) (defun textsec-mixed-script-confusable-p (string1 string2) "Say whether STRING1 and STRING2 are mixed script confusables." (and (equal (textsec-unconfuse-string string1) (textsec-unconfuse-string string2)) ;; And they have no resolved scripts in common. (null (seq-intersection (textsec-resolved-script-set string1) (textsec-resolved-script-set string2))))) (defun textsec-whole-script-confusable-p (string1 string2) "Say whether STRING1 and STRING2 are whole script confusables." (and (textsec-mixed-script-confusable-p string1 string2) (textsec-single-script-p string1) (textsec-single-script-p string2))) (defun textsec-domain-suspicious-p (domain) "Say whether DOMAIN looks suspicious. If it isn't, nil is returned. If it is, a string explaining the problem is returned." (catch 'found (seq-do (lambda (char) (when (eq (elt idna-mapping-table char) t) (throw 'found (format "Disallowed character: `%s' (#x%x)" (string char) char)))) domain) (unless (puny-highly-restrictive-domain-p domain) (throw 'found "%s is not highly restrictive")) nil)) (defun textsec-local-address-suspicious-p (local) "Say whether LOCAL looks suspicious. LOCAL is the bit before \"@\" in an email address. If it suspicious, nil is returned. If it is, a string explaining the problem is returned." (cond ((not (equal local (ucs-normalize-NFKC-string local))) (format "`%s' is not in normalized format `%s'" local (ucs-normalize-NFKC-string local))) ((textsec-mixed-numbers-p local) (format "`%s' contains numbers from different number systems" local)) ((eq (textsec-restriction-level local) 'unrestricted) (format "`%s' isn't restrictive enough" local)) ((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local) (format "`%s' contains invalid dots" local)))) (defun textsec-name-suspicious-p (name) "Say whether NAME looks suspicious. NAME is (for instance) the free-text name from an email address. If it suspicious, nil is returned. If it is, a string explaining the problem is returned." (cond ((not (equal name (ucs-normalize-NFC-string name))) (format "`%s' is not in normalized format `%s'" name (ucs-normalize-NFC-string name))) ((seq-find (lambda (char) (and (member char bidi-control-characters) (not (member char '( ?\N{left-to-right mark} ?\N{right-to-left mark} ?\N{arabic letter mark}))))) name) (format "The string contains bidirectional control characters")) ((textsec-suspicious-nonspacing-p name)))) (defun textsec-suspicious-nonspacing-p (string) "Say whether STRING has a suspicious use of nonspacing characters. If it suspicious, nil is returned. If it is, a string explaining the problem is returned." (let ((prev nil) (nonspace-count 0)) (catch 'found (seq-do (lambda (char) (let ((nonspacing (memq (get-char-code-property char 'general-category) '(Cf Cc Mn)))) (when (and nonspacing (equal char prev)) (throw 'found "Two identical nonspacing characters in a row")) (setq nonspace-count (if nonspacing (1+ nonspace-count) 0)) (when (> nonspace-count 4) (throw 'found "Excessive number of nonspacing characters in a row")) (setq prev char))) string) nil))) (defun textsec-email-suspicious-p (email) "Say whether EMAIL looks suspicious. If it isn't, nil is returned. If it is, a string explaining the problem is returned." (pcase-let* ((`(,address . ,name) (mail-header-parse-address email t)) (`(,local ,domain) (split-string address "@"))) (or (textsec-domain-suspicious-p domain) (textsec-local-address-suspicious-p local) (and name (textsec-name-suspicious-p name))))) (provide 'textsec) ;;; textsec.el ends here