Add support for functions that deal with Unicode scripts

* admin/unidata/Makefile.in (${unidir}/uni-scripts.el): Build
uni-scripts.el.

* admin/unidata/Scripts.txt:
* admin/unidata/ScriptExtensions.txt:
* admin/unidata/PropertyValueAliases.txt: New files from Unicode.

* admin/unidata/README: Update.

* admin/unidata/unidata-gen.el (unidata-gen-charprop): Allow
writing other data, too.
(unidata-gen-scripts, unidata-gen--read-script-aliases)
(unidata-gen--insert-file): New functions to parse the Script* files.

* lisp/international/textsec.el: Implement some functions that
work on scripts.
This commit is contained in:
Lars Ingebrigtsen 2022-01-17 15:47:37 +01:00
parent 39d4e1ca21
commit e2c8091113
9 changed files with 5536 additions and 8 deletions

View file

@ -44,7 +44,8 @@ unifiles = $(addprefix ${unidir}/,$(sort $(shell sed -n 's/^[ \t][ \t]*${lparen}
.PHONY: all
all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \
${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el
${unidir}/charprop.el ${unidir}/emoji-zwj.el ${unidir}/emoji-labels.el \
${unidir}/uni-scripts.el
## Specify .elc as an order-only prereq so as to not needlessly rebuild
## target just because the .elc is missing.
@ -82,6 +83,13 @@ ${unidir}/emoji-labels.el: ${unidir}/../international/emoji.el \
${srcdir}/emoji-test.txt
$(AM_V_GEN)${emacs} -l emoji.el -f emoji--generate-file $@
${unidir}/uni-scripts.el: ${srcdir}/unidata-gen.el \
${srcdir}/Scripts.txt \
${srcdir}/ScriptExtensions.txt \
${srcdir}/PropertyValueAliases.txt
$(AM_V_GEN)${emacs} -L ${srcdir} \
-l unidata-gen -f unidata-gen-scripts $@
.PHONY: charscript.el
charscript.el: ${unidir}/charscript.el

File diff suppressed because it is too large Load diff

View file

@ -48,3 +48,15 @@ https://www.unicode.org/Public/emoji/14.0/emoji-sequences.txt
emoji-test.txt
https://unicode.org/Public/emoji/14.0/emoji-test.txt
2021-10-28
ScriptExtensions.txt
https://www.unicode.org/Public/UCD/latest/ucd/ScriptExtensions.txt
2022-01-17
Scripts.txt
https://www.unicode.org/Public/UCD/latest/ucd/Scripts.txt
2022-01-17
PropertyValueAliases.txt
https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt
2022-01-17

View file

@ -0,0 +1,628 @@
# ScriptExtensions-14.0.0.txt
# Date: 2021-06-04, 02:19:38 GMT
# © 2021 Unicode®, Inc.
# Unicode and the Unicode Logo are registered trademarks of Unicode, Inc. in the U.S. and other countries.
# For terms of use, see http://www.unicode.org/terms_of_use.html
#
# Unicode Character Database
# For documentation, see http://www.unicode.org/reports/tr44/
#
# The Script_Extensions property indicates which characters are commonly used
# with more than one script, but with a limited number of scripts.
# For each code point, there is one or more property values. Each such value is a Script property value.
# For more information, see:
# UAX #24, Unicode Script Property: https://www.unicode.org/reports/tr24/
# Especially the sections:
# https://www.unicode.org/reports/tr24/#Assignment_Script_Values
# https://www.unicode.org/reports/tr24/#Assignment_ScriptX_Values
#
# Each Script_Extensions value in this file consists of a set
# of one or more abbreviated Script property values. The ordering of the
# values in that set is not material, but for stability in presentation
# it is given here as alphabetical.
#
# The Script_Extensions values are presented in sorted order in the file.
# They are sorted first by the number of Script property values in their sets,
# and then alphabetically by first differing Script property value.
#
# Following each distinct Script_Extensions value is the list of code
# points associated with that value, listed in code point order.
#
# All code points not explicitly listed for Script_Extensions
# have as their value the corresponding Script property value
#
# @missing: 0000..10FFFF; <script>
# ================================================
# Property: Script_Extensions
# ================================================
# Script_Extensions=Beng
1CF7 ; Beng # Mc VEDIC SIGN ATIKRAMA
# Total code points: 1
# ================================================
# Script_Extensions=Deva
1CD1 ; Deva # Mn VEDIC TONE SHARA
1CD4 ; Deva # Mn VEDIC SIGN YAJURVEDIC MIDLINE SVARITA
1CDB ; Deva # Mn VEDIC TONE TRIPLE SVARITA
1CDE..1CDF ; Deva # Mn [2] VEDIC TONE TWO DOTS BELOW..VEDIC TONE THREE DOTS BELOW
1CE2..1CE8 ; Deva # Mn [7] VEDIC SIGN VISARGA SVARITA..VEDIC SIGN VISARGA ANUDATTA WITH TAIL
1CEB..1CEC ; Deva # Lo [2] VEDIC SIGN ANUSVARA VAMAGOMUKHA..VEDIC SIGN ANUSVARA VAMAGOMUKHA WITH TAIL
1CEE..1CF1 ; Deva # Lo [4] VEDIC SIGN HEXIFORM LONG ANUSVARA..VEDIC SIGN ANUSVARA UBHAYATO MUKHA
# Total code points: 18
# ================================================
# Script_Extensions=Dupl
1BCA0..1BCA3 ; Dupl # Cf [4] SHORTHAND FORMAT LETTER OVERLAP..SHORTHAND FORMAT UP STEP
# Total code points: 4
# ================================================
# Script_Extensions=Grek
0342 ; Grek # Mn COMBINING GREEK PERISPOMENI
0345 ; Grek # Mn COMBINING GREEK YPOGEGRAMMENI
1DC0..1DC1 ; Grek # Mn [2] COMBINING DOTTED GRAVE ACCENT..COMBINING DOTTED ACUTE ACCENT
# Total code points: 4
# ================================================
# Script_Extensions=Hani
3006 ; Hani # Lo IDEOGRAPHIC CLOSING MARK
303E..303F ; Hani # So [2] IDEOGRAPHIC VARIATION INDICATOR..IDEOGRAPHIC HALF FILL SPACE
3190..3191 ; Hani # So [2] IDEOGRAPHIC ANNOTATION LINKING MARK..IDEOGRAPHIC ANNOTATION REVERSE MARK
3192..3195 ; Hani # No [4] IDEOGRAPHIC ANNOTATION ONE MARK..IDEOGRAPHIC ANNOTATION FOUR MARK
3196..319F ; Hani # So [10] IDEOGRAPHIC ANNOTATION TOP MARK..IDEOGRAPHIC ANNOTATION MAN MARK
31C0..31E3 ; Hani # So [36] CJK STROKE T..CJK STROKE Q
3220..3229 ; Hani # No [10] PARENTHESIZED IDEOGRAPH ONE..PARENTHESIZED IDEOGRAPH TEN
322A..3247 ; Hani # So [30] PARENTHESIZED IDEOGRAPH MOON..CIRCLED IDEOGRAPH KOTO
3280..3289 ; Hani # No [10] CIRCLED IDEOGRAPH ONE..CIRCLED IDEOGRAPH TEN
328A..32B0 ; Hani # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
32C0..32CB ; Hani # So [12] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DECEMBER
32FF ; Hani # So SQUARE ERA NAME REIWA
3358..3370 ; Hani # So [25] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR TWENTY-FOUR
337B..337F ; Hani # So [5] SQUARE ERA NAME HEISEI..SQUARE CORPORATION
33E0..33FE ; Hani # So [31] IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY ONE..IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
1D360..1D371 ; Hani # No [18] COUNTING ROD UNIT DIGIT ONE..COUNTING ROD TENS DIGIT NINE
1F250..1F251 ; Hani # So [2] CIRCLED IDEOGRAPH ADVANTAGE..CIRCLED IDEOGRAPH ACCEPT
# Total code points: 238
# ================================================
# Script_Extensions=Latn
0363..036F ; Latn # Mn [13] COMBINING LATIN SMALL LETTER A..COMBINING LATIN SMALL LETTER X
# Total code points: 13
# ================================================
# Script_Extensions=Nand
1CFA ; Nand # Lo VEDIC SIGN DOUBLE ANUSVARA ANTARGOMUKHA
# Total code points: 1
# ================================================
# Script_Extensions=Syrc
1DFA ; Syrc # Mn COMBINING DOT BELOW LEFT
# Total code points: 1
# ================================================
# Script_Extensions=Arab Copt
102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
102E1..102FB ; Arab Copt # No [27] COPTIC EPACT DIGIT ONE..COPTIC EPACT NUMBER NINE HUNDRED
# Total code points: 28
# ================================================
# Script_Extensions=Arab Rohg
06D4 ; Arab Rohg # Po ARABIC FULL STOP
# Total code points: 1
# ================================================
# Script_Extensions=Arab Nkoo
FD3E ; Arab Nkoo # Pe ORNATE LEFT PARENTHESIS
FD3F ; Arab Nkoo # Ps ORNATE RIGHT PARENTHESIS
# Total code points: 2
# ================================================
# Script_Extensions=Arab Syrc
064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
0670 ; Arab Syrc # Mn ARABIC LETTER SUPERSCRIPT ALEF
# Total code points: 12
# ================================================
# Script_Extensions=Arab Thaa
FDF2 ; Arab Thaa # Lo ARABIC LIGATURE ALLAH ISOLATED FORM
FDFD ; Arab Thaa # So ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
# Total code points: 2
# ================================================
# Script_Extensions=Beng Deva
1CD5..1CD6 ; Beng Deva # Mn [2] VEDIC TONE YAJURVEDIC AGGRAVATED INDEPENDENT SVARITA..VEDIC TONE YAJURVEDIC INDEPENDENT SVARITA
1CD8 ; Beng Deva # Mn VEDIC TONE CANDRA BELOW
1CE1 ; Beng Deva # Mc VEDIC TONE ATHARVAVEDIC INDEPENDENT SVARITA
1CEA ; Beng Deva # Lo VEDIC SIGN ANUSVARA BAHIRGOMUKHA
1CED ; Beng Deva # Mn VEDIC SIGN TIRYAK
1CF5..1CF6 ; Beng Deva # Lo [2] VEDIC SIGN JIHVAMULIYA..VEDIC SIGN UPADHMANIYA
A8F1 ; Beng Deva # Mn COMBINING DEVANAGARI SIGN AVAGRAHA
# Total code points: 9
# ================================================
# Script_Extensions=Bopo Hani
302A..302D ; Bopo Hani # Mn [4] IDEOGRAPHIC LEVEL TONE MARK..IDEOGRAPHIC ENTERING TONE MARK
# Total code points: 4
# ================================================
# Script_Extensions=Bugi Java
A9CF ; Bugi Java # Lm JAVANESE PANGRANGKEP
# Total code points: 1
# ================================================
# Script_Extensions=Cprt Linb
10102 ; Cprt Linb # Po AEGEAN CHECK MARK
10137..1013F ; Cprt Linb # So [9] AEGEAN WEIGHT BASE UNIT..AEGEAN MEASURE THIRD SUBUNIT
# Total code points: 10
# ================================================
# Script_Extensions=Cyrl Glag
0484 ; Cyrl Glag # Mn COMBINING CYRILLIC PALATALIZATION
0487 ; Cyrl Glag # Mn COMBINING CYRILLIC POKRYTIE
2E43 ; Cyrl Glag # Po DASH WITH LEFT UPTURN
A66F ; Cyrl Glag # Mn COMBINING CYRILLIC VZMET
# Total code points: 4
# ================================================
# Script_Extensions=Cyrl Latn
0485..0486 ; Cyrl Latn # Mn [2] COMBINING CYRILLIC DASIA PNEUMATA..COMBINING CYRILLIC PSILI PNEUMATA
# Total code points: 2
# ================================================
# Script_Extensions=Cyrl Perm
0483 ; Cyrl Perm # Mn COMBINING CYRILLIC TITLO
# Total code points: 1
# ================================================
# Script_Extensions=Cyrl Syrc
1DF8 ; Cyrl Syrc # Mn COMBINING DOT ABOVE LEFT
# Total code points: 1
# ================================================
# Script_Extensions=Deva Gran
1CD3 ; Deva Gran # Po VEDIC SIGN NIHSHVASA
1CF3 ; Deva Gran # Lo VEDIC SIGN ROTATED ARDHAVISARGA
1CF8..1CF9 ; Deva Gran # Mn [2] VEDIC TONE RING ABOVE..VEDIC TONE DOUBLE RING ABOVE
# Total code points: 4
# ================================================
# Script_Extensions=Deva Nand
1CE9 ; Deva Nand # Lo VEDIC SIGN ANUSVARA ANTARGOMUKHA
# Total code points: 1
# ================================================
# Script_Extensions=Deva Shrd
1CD7 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA
1CD9 ; Deva Shrd # Mn VEDIC TONE YAJURVEDIC KATHAKA INDEPENDENT SVARITA SCHROEDER
1CDC..1CDD ; Deva Shrd # Mn [2] VEDIC TONE KATHAKA ANUDATTA..VEDIC TONE DOT BELOW
1CE0 ; Deva Shrd # Mn VEDIC TONE RIGVEDIC KASHMIRI INDEPENDENT SVARITA
# Total code points: 5
# ================================================
# Script_Extensions=Deva Taml
A8F3 ; Deva Taml # Lo DEVANAGARI SIGN CANDRABINDU VIRAMA
# Total code points: 1
# ================================================
# Script_Extensions=Geor Latn
10FB ; Geor Latn # Po GEORGIAN PARAGRAPH SEPARATOR
# Total code points: 1
# ================================================
# Script_Extensions=Gran Taml
0BE6..0BEF ; Gran Taml # Nd [10] TAMIL DIGIT ZERO..TAMIL DIGIT NINE
0BF0..0BF2 ; Gran Taml # No [3] TAMIL NUMBER TEN..TAMIL NUMBER ONE THOUSAND
0BF3 ; Gran Taml # So TAMIL DAY SIGN
11301 ; Gran Taml # Mn GRANTHA SIGN CANDRABINDU
11303 ; Gran Taml # Mc GRANTHA SIGN VISARGA
1133B..1133C ; Gran Taml # Mn [2] COMBINING BINDU BELOW..GRANTHA SIGN NUKTA
11FD0..11FD1 ; Gran Taml # No [2] TAMIL FRACTION ONE QUARTER..TAMIL FRACTION ONE HALF-1
11FD3 ; Gran Taml # No TAMIL FRACTION THREE QUARTERS
# Total code points: 21
# ================================================
# Script_Extensions=Gujr Khoj
0AE6..0AEF ; Gujr Khoj # Nd [10] GUJARATI DIGIT ZERO..GUJARATI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Guru Mult
0A66..0A6F ; Guru Mult # Nd [10] GURMUKHI DIGIT ZERO..GURMUKHI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Hani Latn
A700..A707 ; Hani Latn # Sk [8] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER CHINESE TONE YANG RU
# Total code points: 8
# ================================================
# Script_Extensions=Hira Kana
3031..3035 ; Hira Kana # Lm [5] VERTICAL KANA REPEAT MARK..VERTICAL KANA REPEAT MARK LOWER HALF
3099..309A ; Hira Kana # Mn [2] COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK..COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
309B..309C ; Hira Kana # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
30A0 ; Hira Kana # Pd KATAKANA-HIRAGANA DOUBLE HYPHEN
30FC ; Hira Kana # Lm KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF70 ; Hira Kana # Lm HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
FF9E..FF9F ; Hira Kana # Lm [2] HALFWIDTH KATAKANA VOICED SOUND MARK..HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
# Total code points: 14
# ================================================
# Script_Extensions=Knda Nand
0CE6..0CEF ; Knda Nand # Nd [10] KANNADA DIGIT ZERO..KANNADA DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Latn Mong
202F ; Latn Mong # Zs NARROW NO-BREAK SPACE
# Total code points: 1
# ================================================
# Script_Extensions=Mani Ougr
10AF2 ; Mani Ougr # Po MANICHAEAN PUNCTUATION DOUBLE DOT WITHIN DOT
# Total code points: 1
# ================================================
# Script_Extensions=Mong Phag
1802..1803 ; Mong Phag # Po [2] MONGOLIAN COMMA..MONGOLIAN FULL STOP
1805 ; Mong Phag # Po MONGOLIAN FOUR DOTS
# Total code points: 3
# ================================================
# Script_Extensions=Arab Syrc Thaa
061C ; Arab Syrc Thaa # Cf ARABIC LETTER MARK
# Total code points: 1
# ================================================
# Script_Extensions=Arab Thaa Yezi
0660..0669 ; Arab Thaa Yezi # Nd [10] ARABIC-INDIC DIGIT ZERO..ARABIC-INDIC DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Beng Cakm Sylo
09E6..09EF ; Beng Cakm Sylo # Nd [10] BENGALI DIGIT ZERO..BENGALI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Cakm Mymr Tale
1040..1049 ; Cakm Mymr Tale # Nd [10] MYANMAR DIGIT ZERO..MYANMAR DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Cpmn Cprt Linb
10100..10101 ; Cpmn Cprt Linb # Po [2] AEGEAN WORD SEPARATOR LINE..AEGEAN WORD SEPARATOR DOT
# Total code points: 2
# ================================================
# Script_Extensions=Cprt Lina Linb
10107..10133 ; Cprt Lina Linb # No [45] AEGEAN NUMBER ONE..AEGEAN NUMBER NINETY THOUSAND
# Total code points: 45
# ================================================
# Script_Extensions=Deva Gran Knda
1CF4 ; Deva Gran Knda # Mn VEDIC TONE CANDRA ABOVE
# Total code points: 1
# ================================================
# Script_Extensions=Deva Gran Latn
20F0 ; Deva Gran Latn # Mn COMBINING ASTERISK ABOVE
# Total code points: 1
# ================================================
# Script_Extensions=Hani Hira Kana
303C ; Hani Hira Kana # Lo MASU MARK
303D ; Hani Hira Kana # Po PART ALTERNATION MARK
# Total code points: 2
# ================================================
# Script_Extensions=Kali Latn Mymr
A92E ; Kali Latn Mymr # Po KAYAH LI SIGN CWI
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Gran Knda
1CD0 ; Beng Deva Gran Knda # Mn VEDIC TONE KARSHANA
1CD2 ; Beng Deva Gran Knda # Mn VEDIC TONE PRENKHA
# Total code points: 2
# ================================================
# Script_Extensions=Buhd Hano Tagb Tglg
1735..1736 ; Buhd Hano Tagb Tglg # Po [2] PHILIPPINE SINGLE PUNCTUATION..PHILIPPINE DOUBLE PUNCTUATION
# Total code points: 2
# ================================================
# Script_Extensions=Deva Dogr Kthi Mahj
0966..096F ; Deva Dogr Kthi Mahj # Nd [10] DEVANAGARI DIGIT ZERO..DEVANAGARI DIGIT NINE
# Total code points: 10
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana
3003 ; Bopo Hang Hani Hira Kana # Po DITTO MARK
3013 ; Bopo Hang Hani Hira Kana # So GETA MARK
301C ; Bopo Hang Hani Hira Kana # Pd WAVE DASH
301D ; Bopo Hang Hani Hira Kana # Ps REVERSED DOUBLE PRIME QUOTATION MARK
301E..301F ; Bopo Hang Hani Hira Kana # Pe [2] DOUBLE PRIME QUOTATION MARK..LOW DOUBLE PRIME QUOTATION MARK
3030 ; Bopo Hang Hani Hira Kana # Pd WAVY DASH
3037 ; Bopo Hang Hani Hira Kana # So IDEOGRAPHIC TELEGRAPH LINE FEED SEPARATOR SYMBOL
FE45..FE46 ; Bopo Hang Hani Hira Kana # Po [2] SESAME DOT..WHITE SESAME DOT
# Total code points: 10
# ================================================
# Script_Extensions=Arab Nkoo Rohg Syrc Thaa Yezi
060C ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC COMMA
061B ; Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC SEMICOLON
# Total code points: 2
# ================================================
# Script_Extensions=Bopo Hang Hani Hira Kana Yiii
3001..3002 ; Bopo Hang Hani Hira Kana Yiii # Po [2] IDEOGRAPHIC COMMA..IDEOGRAPHIC FULL STOP
3008 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT ANGLE BRACKET
3009 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT ANGLE BRACKET
300A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT DOUBLE ANGLE BRACKET
300B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT DOUBLE ANGLE BRACKET
300C ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT CORNER BRACKET
300D ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT CORNER BRACKET
300E ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE CORNER BRACKET
300F ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE CORNER BRACKET
3010 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT BLACK LENTICULAR BRACKET
3011 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT BLACK LENTICULAR BRACKET
3014 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT TORTOISE SHELL BRACKET
3015 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT TORTOISE SHELL BRACKET
3016 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE LENTICULAR BRACKET
3017 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE LENTICULAR BRACKET
3018 ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE TORTOISE SHELL BRACKET
3019 ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE TORTOISE SHELL BRACKET
301A ; Bopo Hang Hani Hira Kana Yiii # Ps LEFT WHITE SQUARE BRACKET
301B ; Bopo Hang Hani Hira Kana Yiii # Pe RIGHT WHITE SQUARE BRACKET
30FB ; Bopo Hang Hani Hira Kana Yiii # Po KATAKANA MIDDLE DOT
FF61 ; Bopo Hang Hani Hira Kana Yiii # Po HALFWIDTH IDEOGRAPHIC FULL STOP
FF62 ; Bopo Hang Hani Hira Kana Yiii # Ps HALFWIDTH LEFT CORNER BRACKET
FF63 ; Bopo Hang Hani Hira Kana Yiii # Pe HALFWIDTH RIGHT CORNER BRACKET
FF64..FF65 ; Bopo Hang Hani Hira Kana Yiii # Po [2] HALFWIDTH IDEOGRAPHIC COMMA..HALFWIDTH KATAKANA MIDDLE DOT
# Total code points: 26
# ================================================
# Script_Extensions=Deva Knda Mlym Orya Taml Telu
1CDA ; Deva Knda Mlym Orya Taml Telu # Mn VEDIC TONE DOUBLE SVARITA
# Total code points: 1
# ================================================
# Script_Extensions=Adlm Arab Nkoo Rohg Syrc Thaa Yezi
061F ; Adlm Arab Nkoo Rohg Syrc Thaa Yezi # Po ARABIC QUESTION MARK
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Gran Knda Nand Orya Telu Tirh
1CF2 ; Beng Deva Gran Knda Nand Orya Telu Tirh # Lo VEDIC SIGN ARDHAVISARGA
# Total code points: 1
# ================================================
# Script_Extensions=Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc
0640 ; Adlm Arab Mand Mani Ougr Phlp Rohg Sogd Syrc # Lm ARABIC TATWEEL
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh
A836..A837 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So [2] NORTH INDIC QUARTER MARK..NORTH INDIC PLACEHOLDER MARK
A838 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # Sc NORTH INDIC RUPEE MARK
A839 ; Deva Dogr Gujr Guru Khoj Kthi Mahj Modi Sind Takr Tirh # So NORTH INDIC QUANTITY MARK
# Total code points: 4
# ================================================
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh
0952 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN ANUDATTA
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh
0951 ; Beng Deva Gran Gujr Guru Knda Latn Mlym Orya Shrd Taml Telu Tirh # Mn DEVANAGARI STRESS SIGN UDATTA
# Total code points: 1
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh
A833..A835 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Modi Nand Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE SIXTEENTH..NORTH INDIC FRACTION THREE SIXTEENTHS
# Total code points: 3
# ================================================
# Script_Extensions=Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh
A830..A832 ; Deva Dogr Gujr Guru Khoj Knda Kthi Mahj Mlym Modi Nand Sind Takr Tirh # No [3] NORTH INDIC FRACTION ONE QUARTER..NORTH INDIC FRACTION THREE QUARTERS
# Total code points: 3
# ================================================
# Script_Extensions=Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
0964 ; Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DANDA
# Total code points: 1
# ================================================
# Script_Extensions=Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh
0965 ; Beng Deva Dogr Gong Gonm Gran Gujr Guru Knda Limb Mahj Mlym Nand Orya Sind Sinh Sylo Takr Taml Telu Tirh # Po DEVANAGARI DOUBLE DANDA
# Total code points: 1
# EOF

2991
admin/unidata/Scripts.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -1449,20 +1449,24 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(format ";;; %s ends here\n" basename)))))
(or noninteractive (message "Generating %s...done" file)))
(defun unidata-gen-charprop (&optional charprop-file)
(defun unidata-gen-charprop (&optional charprop-file text)
(or charprop-file (setq charprop-file (pop command-line-args-left)))
(with-temp-file charprop-file
(insert ";; Automatically generated by unidata-gen.el."
" -*- lexical-binding: t -*-\n"
";; See the admin/unidata/ directory in the Emacs sources.\n")
(dolist (elt unidata-file-alist)
(dolist (proplist (cdr elt))
(insert (format "(define-char-code-property '%S %S\n %S)\n"
(unidata-prop-prop proplist) (car elt)
(unidata-prop-docstring proplist)))))
(if text
(insert text)
(dolist (elt unidata-file-alist)
(dolist (proplist (cdr elt))
(insert (format "(define-char-code-property '%S %S\n %S)\n"
(unidata-prop-prop proplist) (car elt)
(unidata-prop-docstring proplist))))))
(or noninteractive (message "Writing %s..." charprop-file))
(insert "\n"
"(provide 'charprop)\n"
(format "(provide '%s)\n"
(file-name-sans-extension
(file-name-nondirectory charprop-file)))
" \n"
";; Local Variables:\n"
";; coding: utf-8\n"
@ -1473,6 +1477,105 @@ Property value is a symbol `o' (Open), `c' (Close), or `n' (None)."
(format ";;; %s ends here\n"
(file-name-nondirectory charprop-file)))))
(defun unidata-gen-scripts (&optional file)
;; Running from Makefile.
(unless file
(setq file (pop command-line-args-left)))
(let ((aliases (unidata-gen--read-script-aliases))
(table (make-char-table nil))
(segmented (make-hash-table :test #'equal)))
;; First parse the scripts.
(with-temp-buffer
(unidata-gen--insert-file "Scripts.txt")
(while (not (eobp))
;; 1700..1711 ; Tagalog # Lo [18] TAGALOG LETTER A..TAGALOG
(when (looking-at "\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^ ]+\\) +#")
(let ((start (string-to-number (match-string 1) 16))
(end (and (match-string 2)
(string-to-number (match-string 2) 16)))
(scripts (list (intern (string-replace
"_" "-"
(downcase (match-string 3)))))))
(set-char-table-range
table (if end (cons start end) start) scripts)))
(forward-line 1)))
;; Then parse the file that lists "other scripts" that characters
;; may appear in, and add those.
(with-temp-buffer
(unidata-gen--insert-file "ScriptExtensions.txt")
(while (not (eobp))
;; 102E0 ; Arab Copt # Mn COPTIC EPACT THOUSANDS MARK
(when (looking-at "\\([0-9A-F]+\\)\\(?:\\.\\.\\([0-9A-F]+\\)\\)? +; +\\([^#]+\\)")
(let ((start (string-to-number (match-string 1) 16))
(end (and (match-string 2)
(string-to-number (match-string 2) 16)))
(scripts
(mapcar
(lambda (alias)
(intern (string-replace
"_" "-" (downcase
(gethash alias aliases)))))
(split-string (string-trim (match-string 3))))))
(dolist (script scripts)
(dotimes (i (- (1+ (or end start)) start))
(set-char-table-range
table (+ i start)
(append (elt table (+ i start)) (list script)))))))
(forward-line 1)))
;; Then go through the data and collect into buckets based on
;; identical script lists.
(map-char-table
(lambda (key value)
;; `map-char-table' is reused, so copy it.
(push (if (consp key)
(cons (car key) (cdr key))
key)
;; Keep the first element first, but sort the rest.
(gethash (cons (car value)
(sort (remq (car value) value) #'string<))
segmented)))
table)
;; Then go through the data and collect into buckets based on
(let ((scripts nil))
(maphash
(lambda (segment chars)
(push (cons segment chars) scripts))
segmented)
(setq scripts (sort scripts (lambda (s1 s2)
(string< (caar s1) (caar s2)))))
(with-temp-buffer
(insert "(textsec--create-script-table '(\n")
(dolist (script scripts)
(insert "(" (prin1-to-string (car script)) "\n")
(insert " " (prin1-to-string (cdr script)))
(insert ")\n"))
(insert "))\n")
;; Write the file.
(unidata-gen-charprop file (buffer-string))))))
(defun unidata-gen--read-script-aliases ()
(let ((aliases (make-hash-table :test #'equal)))
(with-temp-buffer
(unidata-gen--insert-file "PropertyValueAliases.txt")
(unless (re-search-forward "^# Script " nil t)
(error "Can't find the Script section"))
(forward-line 2)
(while (looking-at "sc *;")
(let ((elem (split-string (buffer-substring (point) (line-end-position))
";" nil "[ \t]+")))
(setf (gethash (nth 1 elem) aliases)
(nth 2 elem)))
(forward-line 1))
aliases)))
(defun unidata-gen--insert-file (name)
(insert-file-contents
(expand-file-name (concat "../admin/unidata/" name)
data-directory)))
;;; unidata-gen.el ends here

View file

@ -951,6 +951,10 @@ The input must be encoded text.
* Lisp Changes in Emacs 29.1
---
** The Gnus range functions have been moved to a new library, range.el.
All the old names have been made obsolete.
+++
** New function 'function-alias-p'.
This predicate says whether an object is a function alias, and if it

View file

@ -0,0 +1,95 @@
;;; textsec.el --- Functions for handling homoglyphs and the like -*- lexical-binding: t; -*-
;; Copyright (C) 2022 Free Software Foundation, Inc.
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
;;; Commentary:
;;
;;; Code:
(require 'cl-lib)
(defvar textsec--char-scripts nil)
(eval-and-compile
(defun textsec--create-script-table (data)
"Create the textsec--char-scripts char table."
(setq textsec--char-scripts (make-char-table nil))
(dolist (scripts data)
(dolist (range (cadr scripts))
(set-char-table-range textsec--char-scripts
range (car scripts)))))
(require 'uni-scripts))
(defun textsec-scripts (string)
"Return a list of scripts used in STRING."
(seq-map (lambda (char)
(elt textsec--char-scripts char))
string))
(defun textsec-single-script-p (string)
"Return non-nil if STRING is all in a single script.
Note that the concept of \"single script\" used by this function
isn't obvious -- some mixtures of scripts count as a \"single
script\. See
https://www.unicode.org/reports/tr39/#Mixed_Script_Detection
for details."
(let ((scripts (mapcar (lambda (s)
(append s
(mapcan (lambda (script)
(copy-sequence
(textsec--augment-script script)))
s)))
(textsec-scripts string))))
(catch 'empty
(cl-loop for s1 in scripts
do (cl-loop for s2 in scripts
when (and (not (memq 'common s1))
(not (memq 'common s2))
(not (memq 'inherited s1))
(not (memq 'inherited s2))
(not (seq-intersection s1 s2)))
do (throw 'empty nil)))
t)))
(defun textsec--augment-script (script)
(cond
((eq script 'han)
'(hangul japan korea))
((or (eq script 'hiragana)
(eq script 'katakana))
'(japan))
((or (eq script 'hangul)
(eq script 'bopomofo))
'(korea))))
(defun textsec-covering-scripts (string)
"Return a minimal list of scripts used in STRING."
(let* ((scripts (textsec-scripts string))
(set (car scripts)))
(dolist (s scripts)
(setq set (seq-union set (seq-difference s set))))
(delq 'common (delq 'inherited set))))
(provide 'textsec)
;;; textsec.el ends here

View file

@ -0,0 +1,72 @@
;;; textsec-tests.el --- Tests for textsec.el -*- lexical-binding: t; -*-
;; Copyright (C) 2022 Free Software Foundation, Inc.
;; This file is part of GNU Emacs.
;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.
;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>.
;;; Commentary:
;;
;;; Code:
(require 'textsec)
(require 'ert)
(require 'ert-x)
(ert-deftest test-scripts ()
(should (equal (textsec-scripts "Circle")
'((latin) (latin) (latin) (latin) (latin) (latin))))
(should (textsec-single-script-p "Circle"))
(should (equal (textsec-scripts "СігсӀе")
'((cyrillic) (cyrillic) (cyrillic)
(cyrillic) (cyrillic) (cyrillic))))
(should (textsec-single-script-p "СігсӀе"))
(should (equal (textsec-scripts "Сirсlе")
'((cyrillic) (latin) (latin) (cyrillic) (latin) (cyrillic))))
(should-not (textsec-single-script-p "Сirсlе"))
(should (equal (textsec-scripts "Circ1e")
'((latin) (latin) (latin) (latin) (common) (latin))))
(should (textsec-single-script-p "Circ1e"))
(should (equal (textsec-scripts "C𝗂𝗋𝖼𝗅𝖾")
'((latin) (common) (common) (common) (common) (common))))
(should (textsec-single-script-p "C𝗂𝗋𝖼𝗅𝖾"))
(should (equal (textsec-scripts "𝖢𝗂𝗋𝖼𝗅𝖾")
'((common) (common) (common) (common) (common) (common))))
(should (textsec-single-script-p "𝖢𝗂𝗋𝖼𝗅𝖾"))
(should (equal (textsec-scripts "〆切")
'((common han) (han))))
(should (textsec-single-script-p "〆切"))
(should (equal (textsec-scripts "ねガ")
'((hiragana) (katakana))))
(should (textsec-single-script-p "ねガ")))
(ert-deftest test-minimal-scripts ()
(should (equal (textsec-covering-scripts "Circle")
'(latin)))
(should (equal (textsec-covering-scripts "Сirсlе")
'(cyrillic latin)))
(should (equal (textsec-covering-scripts "〆切")
'(han))))
;;; textsec-tests.el ends here