Add textsec functions for verifying email addresses

* lisp/international/characters.el (bidi-control-characters):
Rename from glyphless--bidi-control-characters for use in textsec,
and add LRM/RLM/ALM.
(update-glyphless-char-display): Adjust the code.

* lisp/international/textsec.el (textsec-local-address-suspicious-p)
(textsec-name-suspicious-p, textsec-suspicious-nonspacing-p)
(textsec-email-suspicious-p): New functions.
This commit is contained in:
Lars Ingebrigtsen 2022-01-18 13:19:55 +01:00
parent 4f23dbaa67
commit ce63f91025
3 changed files with 124 additions and 5 deletions

View file

@ -1526,8 +1526,11 @@ Setup `char-width-table' appropriate for non-CJK language environment."
;; We can't use the \N{name} things here, because this file is used
;; too early in the build process.
(defvar glyphless--bidi-control-characters
'(#x202a ; ?\N{left-to-right embedding}
(defvar bidi-control-characters
'(#x200e ; ?\N{left-to-right mark}
#x200f ; ?\N{right-to-left mark}
#x061c ; ?\N{arabic letter mark}
#x202a ; ?\N{left-to-right embedding}
#x202b ; ?\N{right-to-left embedding}
#x202d ; ?\N{left-to-right override}
#x202e ; ?\N{right-to-left override}
@ -1535,7 +1538,8 @@ Setup `char-width-table' appropriate for non-CJK language environment."
#x2067 ; ?\N{right-to-left isolate}
#x2068 ; ?\N{first strong isolate}
#x202c ; ?\N{pop directional formatting}
#x2069)) ; ?\N{pop directional isolate})
#x2069) ; ?\N{pop directional isolate}
"List of bidirectional control characters.")
(defun update-glyphless-char-display (&optional variable value)
"Make the setting of `glyphless-char-display-control' take effect.
@ -1582,8 +1586,7 @@ option `glyphless-char-display'."
(or (aref char-acronym-table from)
"UNK")))
(when (or (eq target 'format-control)
(memq from
glyphless--bidi-control-characters))
(memq from bidi-control-characters))
(set-char-table-range glyphless-char-display
from this-method)))
(setq from (1+ from))))))

View file

@ -28,6 +28,7 @@
(require 'ucs-normalize)
(require 'idna-mapping)
(require 'puny)
(require 'mail-parse)
(defvar textsec--char-scripts nil)
@ -225,6 +226,9 @@ STRING isn't a single script string."
(textsec-single-script-p string2)))
(defun textsec-domain-suspicious-p (domain)
"Say whether DOMAIN looks suspicious.
If it isn't, nil is returned. If it is, a string explaining the
problem is returned."
(catch 'found
(seq-do
(lambda (char)
@ -236,6 +240,79 @@ STRING isn't a single script string."
(throw 'found "%s is not highly restrictive"))
nil))
(defun textsec-local-address-suspicious-p (local)
"Say whether LOCAL looks suspicious.
LOCAL is the bit before \"@\" in an email address.
If it suspicious, nil is returned. If it is, a string explaining
the problem is returned."
(cond
((not (equal local (ucs-normalize-NFKC-string local)))
(format "`%s' is not in normalized format `%s'"
local (ucs-normalize-NFKC-string local)))
((textsec-mixed-numbers-p local)
(format "`%s' contains numbers from different number systems" local))
((eq (textsec-restriction-level local) 'unrestricted)
(format "`%s' isn't restrictive enough" local))
((string-match-p "\\`\\.\\|\\.\\'\\|\\.\\." local)
(format "`%s' contains invalid dots" local))))
(defun textsec-name-suspicious-p (name)
"Say whether NAME looks suspicious.
NAME is (for instance) the free-text name from an email address.
If it suspicious, nil is returned. If it is, a string explaining
the problem is returned."
(cond
((not (equal name (ucs-normalize-NFC-string name)))
(format "`%s' is not in normalized format `%s'"
name (ucs-normalize-NFC-string name)))
((seq-find (lambda (char)
(and (member char bidi-control-characters)
(not (member char
'( ?\N{left-to-right mark}
?\N{right-to-left mark}
?\N{arabic letter mark})))))
name)
(format "The string contains bidirectional control characters"))
((textsec-suspicious-nonspacing-p name))))
(defun textsec-suspicious-nonspacing-p (string)
"Say whether STRING has a suspicious use of nonspacing characters.
If it suspicious, nil is returned. If it is, a string explaining
the problem is returned."
(let ((prev nil)
(nonspace-count 0))
(catch 'found
(seq-do
(lambda (char)
(let ((nonspacing
(memq (get-char-code-property char 'general-category)
'(Cf Cc Mn))))
(when (and nonspacing
(equal char prev))
(throw 'found "Two identical nonspacing characters in a row"))
(setq nonspace-count (if nonspacing
(1+ nonspace-count)
0))
(when (> nonspace-count 4)
(throw 'found
"Excessive number of nonspacing characters in a row"))
(setq prev char)))
string)
nil)))
(defun textsec-email-suspicious-p (email)
"Say whether EMAIL looks suspicious.
If it isn't, nil is returned. If it is, a string explaining the
problem is returned."
(pcase-let* ((`(,address . ,name) (mail-header-parse-address email t))
(`(,local ,domain) (split-string address "@")))
(or
(textsec-domain-suspicious-p domain)
(textsec-local-address-suspicious-p local)
(textsec-name-suspicious-p name))))
(provide 'textsec)
;;; textsec.el ends here

View file

@ -115,4 +115,43 @@
(should-not (textsec-domain-suspicious-p "foo.org"))
(should (textsec-domain-suspicious-p "f\N{LEFT-TO-RIGHT ISOLATE}oo.org")))
(ert-deftest test-suspicious-local ()
(should-not (textsec-local-address-suspicious-p "larsi"))
(should (textsec-local-address-suspicious-p ".larsi"))
(should (textsec-local-address-suspicious-p "larsi."))
(should-not (textsec-local-address-suspicious-p "la.rsi"))
(should (textsec-local-address-suspicious-p "lar..si"))
(should-not (textsec-local-address-suspicious-p "LÅRSI"))
(should (textsec-local-address-suspicious-p "LÅRSI"))
(should (textsec-local-address-suspicious-p "larsi8")))
(ert-deftest test-suspicious-name ()
(should-not (textsec-name-suspicious-p "Lars Ingebrigtsen"))
(should (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN"))
(should-not (textsec-name-suspicious-p "LÅRS INGEBRIGTSEN"))
(should (textsec-name-suspicious-p
"Lars Ingebrigtsen\N{LEFT-TO-RIGHT ISOLATE}"))
(should-not (textsec-name-suspicious-p
"Lars Ingebrigtsen\N{LEFT-TO-RIGHT MARK}"))
(should (textsec-name-suspicious-p
"\N{LEFT-TO-RIGHT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen"))
(should-not (textsec-name-suspicious-p
"\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}Lars Ingebrigtsen"))
(should (textsec-name-suspicious-p
"\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT MARK}Lars Ingebrigtsen")))
(ert-deftest test-suspicious-email ()
(should-not
(textsec-email-suspicious-p "Lars Ingebrigtsen <larsi@gnus.org>"))
(should
(textsec-email-suspicious-p "LÅrs Ingebrigtsen <larsi@gnus.org>"))
(should
(textsec-email-suspicious-p "Lars Ingebrigtsen <.larsi@gnus.org>"))
(should
(textsec-email-suspicious-p "Lars Ingebrigtsen <larsi@gn\N{LEFT-TO-RIGHT ISOLATE}us.org>")))
;;; textsec-tests.el ends here