Support for Unicode emoji sequences

This covers both sequences using Zero-Width-Joiner codepoints and those without. Bug#39799, I hope. * .gitignore: Add emoji-zwj.el * admin/notes/unicode: Add emoji-zwj-sequences.txt and emoji-sequences.txt references. Describe how to test after updating to a newer Unicode version. * admin/unidata/Makefile.in (all): add emoji-zwj.el as a dependency. (emoji-zwj.el): Add target plus rules for building. (gen-clean): Add emoji-zwj.el. * admin/unidata/README: Add emoji-zwj-sequences.txt and emoji-sequences.txt references. * admin/unidata/blocks.awk: Force emoji script to be used for certain codepoints that are used by the Unicode sequences. * admin/unidata/emoji-sequences.txt: New file. * admin/unidata/emoji-zwj-sequences.txt: New file. * admin/unidata/emoji-zwj.awk: New file. Derives composition-function-table rules from emoji-zwj-sequences.txt, plus hardcodes some derived manually from emoji-sequences.txt. * etc/NEWS: Announce change. * lisp/international/characters.el: Load the generated emoji-zwj.el * src/Makefile.in (emoji-zwj): New target. (temacs): Add emoji-zwj as a dependency.
2021-09-20 12:41:15 +02:00 · 2021-09-20 12:41:15 +02:00 · de289d58a4
commit de289d58a4
parent 0b98ea5fbe
11 changed files with 3078 additions and 9 deletions
--- a/.gitignore
+++ b/.gitignore
@ -212,6 +212,7 @@ etc/charsets/*.map
 lisp/international/charprop.el
 lisp/international/charscript.el
 lisp/international/cp51932.el
+lisp/international/emoji-zwj.el
 lisp/international/eucjp-ms.el
 lisp/international/uni-*.el
 lisp/language/pinyin.el
--- a/admin/notes/unicode
+++ b/admin/notes/unicode
@ -17,13 +17,15 @@ Emacs uses the following files from the Unicode Character Database
  . NormalizationTest.txt
  . SpecialCasing.txt
  . emoji-data.txt
+  . emoji-zwj-sequences.txt
+  . emoji-sequences.txt
  . BidiCharacterTest.txt

-First, the first 8 files need to be copied into admin/unidata/, and
+First, the first 10 files need to be copied into admin/unidata/, and
 the file https://www.unicode.org/copyright.html should be copied over
-copyright.html in admin/unidata (that file and emoji-data.txt might
-need trailing whitespace removed before they can be committed to the
-Emacs repository).
+copyright.html in admin/unidata (some of these files might need
+trailing whitespace removed before they can be committed to the
+Emacs repository).

 Then Emacs should be rebuilt for them to take effect.  Rebuilding
 Emacs updates several derived files elsewhere in the Emacs source
@ -112,6 +114,11 @@ FONT-NAME-REGEXP is checked using `string-match'."
      (princ (format "Font not matching '%s' was used for the following characters:\n%s"
                     font-name-regexp (reverse res))))))))

+Visit "emoji-zwj-sequences.txt" and "emoji-sequences.txt" with the
+rebuilt Emacs, and check that the sample sequences are composed
+properly.  Note that your emoji font might not have glyphs for the
+newest codepoints yet.
+
 Finally, etc/NEWS should be updated to announce the support for the
 new Unicode version.

--- a/admin/unidata/Makefile.in
+++ b/admin/unidata/Makefile.in
@ -41,7 +41,7 @@ unifiles = $(addprefix ${unidir}/,$(sort $(shell sed -n 's/^[ \t][ \t]*${lparen}
 .PHONY: all

 all: ${top_srcdir}/src/macuvs.h ${unifiles} ${unidir}/charscript.el \
-  ${unidir}/charprop.el
+  ${unidir}/charprop.el ${unidir}/emoji-zwj.el

 ## Specify .elc as an order-only prereq so as to not needlessly rebuild
 ## target just because the .elc is missing.
@ -86,6 +86,13 @@ ${unidir}/charscript.el: ${blocks}
 ${unidir}/charscript.el: ${srcdir}/Blocks.txt ${srcdir}/emoji-data.txt
 	$(AM_V_GEN)$(AWK) -f ${blocks} $^ > $@

+.PHONY: emoji-zwj.el
+emoji-zwj.el: ${unidir}/emoji-zwj.el
+
+zwj = ${srcdir}/emoji-zwj.awk
+
+${unidir}/emoji-zwj.el: ${srcdir}/emoji-zwj-sequences.txt ${zwj}
+	$(AM_V_GEN)$(AWK) -f ${zwj} < $< > $@

 .PHONY: clean bootstrap-clean distclean maintainer-clean gen-clean

@ -104,6 +111,7 @@ distclean: clean
 ## from a make target, we don't delete it here.
 gen-clean:
 	rm -f ${unidir}/charscript.el*
+	rm -f ${unidir}/emoji-zwj.el*
 	rm -f ${unifiles} ${unidir}/charprop.el
 ## ref: https://lists.gnu.org/r/emacs-devel/2013-11/msg01029.html

--- a/admin/unidata/README
+++ b/admin/unidata/README
@ -36,3 +36,11 @@ http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
 emoji-data.txt
 https://www.unicode.org/Public/14.0.0/ucd/emoji/emoji-data.txt
 2021-08-26
+
+emoji-zwj-sequences.txt
+https://www.unicode.org/Public/emoji/14.0/emoji-zwj-sequences.txt
+2021-06-08
+
+emoji-sequences.txt
+https://www.unicode.org/Public/emoji/14.0/emoji-sequences.txt
+2021-08-26
--- a/admin/unidata/blocks.awk
+++ b/admin/unidata/blocks.awk
@ -221,6 +221,46 @@ FILENAME ~ "emoji-data.txt" && /^[0-9A-F].*; Emoji_Presentation / {
 }

 END {
+    ## These codepoints have Emoji_Presentation = No, but they are
+    ## used in emoji-sequences.txt and emoji-zwj-sequences.txt (with a
+    ## Variation Selector), so force them into the emoji script so
+    ## they will get composed correctly.  FIXME: delete this when we
+    ## can change the font used for a codepoint based on whether it's
+    ## followed by a VS (usually VS-16)
+    idx = 0
+    override_start[idx] = "261D"
+    override_end[idx] = "261D"
+    idx++
+    override_start[idx] = "26F9"
+    override_end[idx] = "26F9"
+    idx++
+    override_start[idx] = "270C"
+    override_end[idx] = "270D"
+    idx++
+    override_start[idx] = "2764"
+    override_end[idx] = "2764"
+    idx++
+    override_start[idx] = "1F3CB"
+    override_end[idx] = "1F3CC"
+    idx++
+    override_start[idx] = "1F3F3"
+    override_end[idx] = "1F3F4"
+    idx++
+    override_start[idx] = "1F441"
+    override_end[idx] = "1F441"
+    idx++
+    override_start[idx] = "1F575"
+    override_end[idx] = "1F575"
+
+    for (k in override_start)
+    {
+        i++
+        start[i] = override_start[k]
+        end[i] = override_end[k]
+        alt[i] = "emoji"
+        name[i] = "Autogenerated emoji (override)"
+    }
+
    print ";;; charscript.el --- character script table  -*- lexical-binding:t -*-"
    print ";;; Automatically generated from admin/unidata/Blocks.txt"
    print "(let (script-list)"
--- a/admin/unidata/emoji-sequences.txt
+++ b/admin/unidata/emoji-sequences.txt
--- a/admin/unidata/emoji-zwj-sequences.txt
+++ b/admin/unidata/emoji-zwj-sequences.txt
--- a/admin/unidata/emoji-zwj.awk
+++ b/admin/unidata/emoji-zwj.awk
@ -0,0 +1,111 @@
+#!/usr/bin/awk -f
+
+## Copyright (C) 2020 Free Software Foundation, Inc.
+
+## Author: Robert Pluim <rpluim@gmail.com>
+
+## This file is part of GNU Emacs.
+
+## GNU Emacs is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+
+## GNU Emacs is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+
+## You should have received a copy of the GNU General Public License
+## along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.
+
+### Commentary:
+
+## This script takes as input Unicode's emoji-zwj-sequences.txt
+## and produces output for Emacs's lisp/international/emoji-zwj.el.
+## It also outputs the composition sequences for flags, UK flags, and
+## skin tones which have been derived from emoji-sequences.txt by hand.
+
+## For additional details, see <https://debbugs.gnu.org/39799#8>.
+
+## Things to do after installing a new version of
+## emoji-zwj-sequences.txt and emoji-sequences.txt:
+## Check the output against the old output.  See if there are any new
+## composition sequences in emoji-sequences.txt that need to be
+## added.  Rebuild Emacs, visit emoji-zwj-sequences.txt and
+## emoji-sequences.txt and check that the various sequences are being
+## composed properly.  Don't forget to install an appropriate font,
+## such as Noto Color Emoji.
+
+### Code:
+
+## For each line starting with a digit or A-F, append the sequence, as
+## a quoted string of \N{U+XXXX} escapes, to vec[] under the sequence's
+## first codepoint; ch[] records which first codepoints have been seen.
+/^[0-9A-F]/ {
+    ## Discard everything from the first ";" (and any spaces before it).
+    sub(/ *;.*/, "", $0)
+    ncodes = split($0, elts)
+    lead = elts[1]
+    if (lead in ch)
+        vec[lead] = vec[lead] "\n"     # later sequences go on their own line
+    else
+    {
+        ch[lead] = lead
+        vec[lead] = ""
+    }
+    quoted = "\""
+    for (pos = 1; pos <= ncodes; pos++)
+        quoted = quoted sprintf("\\N{U+%s}", elts[pos])
+    vec[lead] = vec[lead] quoted "\""
+}
+
+## Emit emoji-zwj.el: a backquoted alist pairing each first codepoint
+## in ch[] with a `regexp-opt' over its sequences from vec[], a `dolist'
+## that appends one rule per entry to `composition-function-table', and
+## three hand-written rules.  The order of "for (elt in ch)" is
+## unspecified in awk, so the order of the alist entries may vary.
+END {
+    print ";;; emoji-zwj.el --- emoji zwj character composition table"
+    print ";;; Automatically generated from admin/unidata/emoji-zwj-sequences.txt"
+    print "(eval-when-compile (require 'regexp-opt))"
+    print "(dolist (elt `("
+
+    for (elt in ch)
+    {
+        printf("(#x%s .\n,(eval-when-compile (regexp-opt\n'(\n%s\n))))\n", elt, vec[elt])
+    }
+    print "))"
+    print "  (set-char-table-range composition-function-table"
+    print "                        (car elt)"
+    print "                        (nconc (char-table-range composition-function-table (car elt))"
+    print "                               (list (vector (cdr elt)"
+    print "                                             0"
+    ## The fifth paren closes the `dolist' here, so the fixed rules
+    ## below are emitted at top level and evaluated once, instead of
+    ## being appended again for every entry of the alist.
+    print "                                             'compose-gstring-for-graphic)))))"
+
+    print ";; The following three blocks are derived by hand from emoji-sequences.txt"
+    print ";; FIXME: add support for Emoji_Keycap_Sequence once we learn how to respect FE0F/VS-16"
+    print ";; for ASCII characters."
+
+    print ";; Flags"
+    print "(set-char-table-range composition-function-table"
+    print "                      '(#x1F1E6 . #x1F1FF)"
+    print "                      (nconc (char-table-range composition-function-table '(#x1F1E6 . #x1F1FF))"
+    print "                             (list (vector \"[\\U0001F1E6-\\U0001F1FF][\\U0001F1E6-\\U0001F1FF]\""
+    print "                                           0"
+    print "                                    'compose-gstring-for-graphic))))"
+
+    print ";; UK Flags"
+    print "(set-char-table-range composition-function-table"
+    print "                      #x1F3F4"
+    print "                      (nconc (char-table-range composition-function-table #x1F3F4)"
+    print "                             (list (vector \"\\U0001F3F4\\U000E0067\\U000E0062\\(?:\\U000E0065\\U000E006E\\U000E0067\\|\\U000E0073\\U000E0063\\U000E0074\\|\\U000E0077\\U000E006C\\U000E0073\\)\\U000E007F\""
+    print "                                           0"
+    print "                                    'compose-gstring-for-graphic))))"
+
+    print ";; Skin tones"
+    print "(set-char-table-range composition-function-table"
+    print "                      '(#x1F3FB . #x1F3FF)"
+    print "                      (nconc (char-table-range composition-function-table '(#x1F3FB . #x1F3FF))"
+    print "                             (list (vector \".[\\U0001F3FB-\\U0001F3FF]\""
+    print "                                           1"
+    print "                                    'compose-gstring-for-graphic))))"
+
+    print "\n"
+    print "(provide 'emoji-zwj)"
+}
--- a/etc/NEWS
+++ b/etc/NEWS
@ -144,6 +144,13 @@ Emoji" by default for that script.  Use:

 to change the font used.

+++
+** Zero Width Joiner (ZWJ) and emoji sequences are now composed.
+Emacs can now compose (almost) all the Unicode-14 ZWJ and emoji
+sequences (if a suitable font is installed) so that they are displayed
+as single glyphs instead of multiple ones.  'Noto Color Emoji' is one
+such suitable font.
+
 +++
 ** New command 'execute-extended-command-for-buffer'.
 This new command, bound to 'M-S-x', works like
--- a/lisp/international/characters.el
+++ b/lisp/international/characters.el
@ -1428,8 +1428,12 @@ Setup `char-width-table' appropriate for non-CJK language environment."
 (if dump-mode
    ;; While dumping, we can't use require, and international is not
    ;; in load-path.
-    (load "international/charscript")
-  (require 'charscript))
+    (progn
+      (load "international/charscript")
+      (load "international/emoji-zwj"))
+  (progn
+    (require 'charscript)
+    (require 'emoji-zwj)))

 (map-charset-chars
 (lambda (range _ignore)
--- a/src/Makefile.in
+++ b/src/Makefile.in
@ -545,7 +545,11 @@ charscript = ${lispintdir}/charscript.el
 ${charscript}: FORCE
 	$(MAKE) -C ../admin/unidata $(notdir $@)

-${lispintdir}/characters.elc: ${charscript:.el=.elc}
+emoji-zwj = ${lispintdir}/emoji-zwj.el
+${emoji-zwj}: FORCE
+	$(MAKE) -C ../admin/unidata $(notdir $@)
+
+${lispintdir}/characters.elc: ${charscript:.el=.elc} ${emoji-zwj:.el=.elc}

 SYSTEM_TYPE = @SYSTEM_TYPE@

@ -634,7 +638,7 @@ endif
 ## This goes on to affect various things, and the emacs binary fails
 ## to start if Vinstallation_directory has the wrong value.
 temacs$(EXEEXT): $(LIBXMENU) $(ALLOBJS) $(LIBEGNU_ARCHIVE) $(EMACSRES) \
-  $(charsets) $(charscript) $(MAKE_PDUMPER_FINGERPRINT)
+  $(charsets) $(charscript) ${emoji-zwj} $(MAKE_PDUMPER_FINGERPRINT)
 	$(AM_V_CCLD)$(CC) -o $@.tmp \
 	  $(ALL_CFLAGS) $(TEMACS_LDFLAGS) $(LDFLAGS) \
 	  $(ALLOBJS) $(LIBEGNU_ARCHIVE) $(W32_RES_LINK) $(LIBES)