*** empty log message ***

2002-07-24 23:01:32 +00:00 · 2002-07-24 23:01:32 +00:00 · fc2938d180
commit fc2938d180
parent e65186d5b2
4 changed files with 420 additions and 60 deletions
--- a/etc/NEWS
+++ b/etc/NEWS
@ -110,13 +110,48 @@ now look at the character after point.  If a face or faces are
 specified for that character, the commands by default customize those
 faces.

+** New language environments: French, Cyrillic-KOI8-U, Windows-1251,
+Cyrillic-KOI8-T, Bulgarian, Belarusian, Ukrainian, UTF-8,
+Windows-1255, Welsh, Latin-7, Lithuanian, Latvian.
+
+** New input methods: latin-alt-postfix, latin-postfix, latin-prefix,
+ukrainian-computer, belarusian, bulgarian-bds, russian-computer,
+vietnamese-telex, lithuanian-numeric, lithuanian-keyboard,
+latvian-keyboard, welsh, georgian, rfc1345, ucs, sgml,
+bulgarian-phonetic, dutch.
+
+** Many new coding systems are available by loading the `code-pages'
+library.  These include complete versions of most of those in
+codepage.el, based on Unicode mappings.
+
+** The utf-8 coding system has been enhanced.  Untranslatable utf-8
+sequences (mostly representing CJK characters) are composed into
+single quasi-characters.  By loading the library utf-8-subst, you can
+arrange to translate many utf-8 CJK character sequences into real
+Emacs characters in a similar way to the Mule-UCS system.  The utf-8
+coding system will now encode characters from most of Emacs's
+one-dimensional internal charsets, specifically the ISO-8859 ones.
+
+** New command `ucs-insert' inserts a character specified by its
+Unicode.
+
 +++
-** Limited support for charset unification has been added.
-By default, Emacs now knows how to translate latin-N chars between their
-charset and some other latin-N charset or unicode.  You can force a
-more complete unification by calling (unify-8859-on-decoding-mode 1).
-That maps all the Latin-N character sets into either Latin-1
-or Unicode characters.
+** Limited support for character unification has been added.
+Emacs now knows how to translate Latin-N chars between their charset
+and some other Latin-N charset or Unicode.  By default this
+translation will happen automatically on encoding.  Quail input
+methods use the translations to make the input conformant with the
+encoding of the buffer in which it's being used where possible.
+
+You can force a more complete unification with the user option
+unify-8859-on-decoding-mode.  That maps all the Latin-N character sets
+into Unicode characters (from the latin-iso8859-1 and
+mule-unicode-0100-24ff charsets) on decoding.
+
+** There is support for decoding Greek and Cyrillic characters into
+either Unicode (the mule-unicode charsets) or the iso-8859 charsets,
+when possible.  The latter are more space-efficient.  This is
+controlled by user option utf-8-fragment-on-decoding.

 ---
 ** The scrollbar under LessTif or Motif has a smoother drag-scrolling.
@ -940,6 +975,9 @@ mode-lines in inverse-video.

 * Lisp Changes in Emacs 21.4

+** New CCL functions `lookup-character' and `lookup-integer' access
+hash tables defined by the Lisp function `define-translation-hash-table'.
+
 ** There is a new Warnings facility; see the functions `warn'
 and `display-warning'.

@ -10825,7 +10863,7 @@ select one of those items.
 ----------------------------------------------------------------------
 Copyright information:

-Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc.
+Copyright (C) 1999, 2000, 2001, 2002 Free Software Foundation, Inc.

   Permission is granted to anyone to make or distribute verbatim copies
   of this document as received, in any medium, provided that the
--- a/etc/PROBLEMS
+++ b/etc/PROBLEMS
@ -1,6 +1,50 @@
 This file describes various problems that have been encountered
 in compiling, installing and running GNU Emacs.

+
+* Mule-UCS loads very slowly.
+
+Changes to Emacs internals interact badly with Mule-UCS's `un-define'
+library, which is the usual interface to Mule-UCS.  Apply the
+following patch to Mule-UCS 0.84 and rebuild it.  That will help,
+though loading will still be slower than in Emacs 20.  (Some
+distributions, such as Debian, may already have applied such a patch.)
+
+--- lisp/un-define.el	6 Mar 2001 22:41:38 -0000	1.30
+++ lisp/un-define.el	19 Apr 2002 18:34:26 -0000
+@@ -610,13 +624,21 @@ by calling post-read-conversion and pre-
+ 
+  (mapcar
+   (lambda (x)
+-    (mapcar
+-     (lambda (y)
+-       (mucs-define-coding-system
+-	(nth 0 y) (nth 1 y) (nth 2 y)
+-	(nth 3 y) (nth 4 y) (nth 5 y) (nth 6 y))
+-       (coding-system-put (car y) 'alias-coding-systems (list (car x))))
+-     (cdr x)))
+    (if (fboundp 'register-char-codings)
+	;; Mule 5, where we don't need the eol-type specified and
+	;; register-char-codings may be very slow for these coding
+	;; system definitions.
+	(let ((y (cadr x)))
+	  (mucs-define-coding-system
+	   (car x) (nth 1 y) (nth 2 y)
+	   (nth 3 y) (nth 4 y) (nth 5 y)))
+      (mapcar
+       (lambda (y)
+	 (mucs-define-coding-system
+	  (nth 0 y) (nth 1 y) (nth 2 y)
+	  (nth 3 y) (nth 4 y) (nth 5 y) (nth 6 y))
+	 (coding-system-put (car y) 'alias-coding-systems (list (car x)))))
+      (cdr x)))
+   `((utf-8
+      (utf-8-unix
+       ?u "UTF-8 coding system"
+
+Note that Emacs has native support for Unicode, roughly equivalent to
+Mule-UCS's, so you may not need it.
+
 * Building Emacs with GCC 2.9x fails in the `src' directory.

 This may happen if you use a development version of GNU `cpp' from one
@ -115,7 +159,9 @@ should now succeed.
 * JPEG images aren't displayed.

 This has been reported when Emacs is built with jpeg-6a library.
-Upgrading to jpeg-6b solves the problem.
+Upgrading to jpeg-6b solves the problem.  Configure checks for the
+correct version, but this problem could occur if a binary built
+against a shared libjpeg is run on a system with an older version.

 * Building `ctags' for MS-Windows with the MinGW port of GCC fails.

@ -386,14 +432,13 @@ ought to recognize the Windows language-change event and set up the
 appropriate keyboard encoding automatically, but it doesn't do that
 yet.)

-Multilingual text put into the Windows clipboard by other Windows
-applications cannot be safely pasted into Emacs (as of v21.2).  This
-is because Windows uses Unicode to represent multilingual text, but
-Emacs does not yet support Unicode well enough to decode it.  This
-means that Emacs can only interchange non-ASCII text with other
-Windows programs if the characters are in the system codepage.
-Reportedly, a partial solution is to install the Mule-UCS package and
-set selection-coding-system to utf-16-le-dos.
+Windows uses UTF-16 encoding to deal with multilingual text (text not
+encodable in the `system codepage') in the clipboard.  To deal with
+this, load the library `utf-16' and use `set-selection-coding-system'
+to set the clipboard coding system to `utf-16-le-dos'.  This won't
+cope with Far Eastern (`CJK') text; if necessary, install the Mule-UCS
+package (see etc/MORE.STUFF), whose `utf-16-le-dos' coding system does
+encode a lot of CJK characters.

 The %b specifier for format-time-string does not produce abbreviated
 month names with consistent widths for some locales on some versions
@ -492,10 +537,9 @@ src/s/hpux10.h.

 * Crashes when displaying GIF images in Emacs built with version
 libungif-4.1.0 are resolved by using version libungif-4.1.0b1.
-
-Beginning with version 21.3, Emacs refuses to link against libungif
-whose version is 4.1.0 or older (the `configure' script behaves as if
-libungif were not available at all).
+Configure checks for the correct version, but this problem could occur
+if a binary built against a shared libungif is run on a system with an
+older version.

 * Font Lock displays portions of the buffer in incorrect faces.

@ -596,9 +640,8 @@ this problem by putting this in your `.emacs' file:

 (setq ange-ftp-ftp-program-args '("-i" "-n" "-g" "-v" "--prompt" "")

-* Some versions of the W3 package released before Emacs 21.1 don't run
-properly with Emacs 21.  These problems are fixed in W3 version
-4.0pre.47.
+* Versions of the W3 package released before Emacs 21.1 don't run
+under Emacs 21.  This is fixed in W3 version 4.0pre.47.

 * On AIX, if linking fails because libXbsd isn't found, check if you
 are compiling with the system's `cc' and CFLAGS containing `-O5'.  If
@ -633,43 +676,6 @@ Version 1 of OpenLDAP is now deprecated.  If you are still using it,
 please upgrade to version 2.  As a temporary workaround, remove
 argument "-x" from the variable `ldap-ldapsearch-args'.

-* Unicode characters are not unified with other Mule charsets.
-
-As of v21.1, Emacs charsets are still not unified.  This means that
-characters which belong to charsets such as Latin-2, Greek, Hebrew,
-etc. and the same characters in the `mule-unicode-*' charsets are
-different characters, as far as Emacs is concerned.  For example, text
-which includes Unicode characters from the Latin-2 locale cannot be
-encoded by Emacs with ISO 8859-2 coding system; and if you yank Greek
-text from a buffer whose buffer-file-coding-system is greek-iso-8bit
-into a mule-unicode-0100-24ff buffer, Emacs won't be able to save that
-buffer neither as ISO 8859-7 nor as UTF-8.
-
-To work around this, install some add-on package such as Mule-UCS.
-
-* Problems when using Emacs with UTF-8 locales
-
-Some systems, including recent versions of GNU/Linux, have terminals
-or X11 subsystems that can be configured to provide Unicode/UTF-8
-input and display.  Normally, such a system sets environment variables
-such as LANG, LC_CTYPE, or LC_ALL to a string which ends with a
-`.UTF-8'.  For example, a system like this in a French locale might
-use `fr_FR.UTF-8' as the value of LANG.
-
-Since Unicode support in Emacs, as of v21.1, is not yet complete (see
-the previous entry in this file), UTF-8 support is not enabled by
-default, even in UTF-8 locales.  Thus, some Emacs features, such as
-non-ASCII keyboard input, might appear to be broken in these locales.
-To solve these problems, you need to turn on some options in your
-`.emacs' file.  Specifically, the following customizations should make
-Emacs work correctly with UTF-8 input and text:
-
-    (setq locale-coding-system 'utf-8)
-    (set-terminal-coding-system 'utf-8)
-    (set-keyboard-coding-system 'utf-8)
-    (set-selection-coding-system 'utf-8)
-    (prefer-coding-system 'utf-8)
-
 * The `oc-unicode' package doesn't work with Emacs 21.

 This package tries to define more private charsets than there are free
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@ -1,3 +1,31 @@
+2002-07-24  Dave Love  <fx@gnu.org>
+
+	* international/mule.el (set-auto-coding): Doc fix.
+
+	* international/utf-16.el: New file.
+
+	* language/european.el ("German", "French", "Spanish", "Turkish"):
+	Add alternative coding systems.
+	("Dutch"): Likewise.  Add input method.
+	("Welsh", "Latin-7"): Add nonascii-translation.
+
+	* language/georgian.el ("Georgian"): Add nonascii-translation.
+
+	* international/titdic-cnv.el: Doc fixes.
+	(tit-process-header): Add coding cookie.
+	(titdic-convert): Force writing as iso-2022-7bit.
+
+	* international/ja-dic-cnv.el (skkdic-convert): Add coding cookie.
+
+	* international/mule-cmds.el: Doc fixes.
+	(unencodable-char-position): New.
+	(select-safe-coding-system): Use it to indicate problematic
+	characters and add extra explanation.  Avoid checking auto-coding
+	for compressed files.
+	(leim-list-header): Add coding cookie.
+	(input-method-verbose-flag): Modify :type.
+	(locale-language-names): Add bs, wa.  Modify cy.
+
 2002-07-24  Richard M. Stallman  <rms@gnu.org>

 	* emacs-lisp/bytecomp.el (byte-compile-log-warning):
--- a/lisp/international/utf-16.el
+++ b/lisp/international/utf-16.el
@ -0,0 +1,288 @@
+;;; utf-16.el --- UTF-16 encoding/decoding
+
+;; Copyright (C) 2001, 2002 Free Software Foundation, Inc.
+
+;; Author: Dave Love <fx@gnu.org>
+;; Keywords: Unicode, UTF-16, i18n
+
+;; This file is part of GNU Emacs.
+
+;; GNU Emacs is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs; see the file COPYING.  If not, write to the
+;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+;; Boston, MA 02111-1307, USA.
+
+;;; Commentary:
+
+;; Support for UTF-16, which is a two-byte encoding (modulo
+;; surrogates) of Unicode, written either in little or big endian
+;; order: coding-systems `mule-utf-16-le' and `mule-utf-16-be'.
+;; (utf-16-le is used by the DozeN'T clipboard, for instance.)  The
+;; data are preceded by a two-byte signature which identifies their
+;; byte sex.  These are used by the coding-category-utf-16-{b,l}e code
+;; to identify the coding, but ignored on decoding.
+
+;; Note that un-decodable sequences aren't (yet?) preserved as raw
+;; bytes, as they are with utf-8, so reading and writing as utf-16 can
+;; corrupt data.
+
+;;; Code:
+
+;; We end up with trivially different -le and -be versions of most
+;; things below, sometimes with commonality abstracted into a let
+;; binding for maintenance convenience.
+
+;; We'd need new charsets distinct from ascii and eight-bit-control to
+;; deal with untranslated sequences, since we can't otherwise
+;; distinguish the bytes, as we can with utf-8.
+
+;; ;; Do a multibyte write for bytes in r3 and r4.
+;; ;; Intended for untranslatable utf-16 sequences.
+;; (define-ccl-program ccl-mule-utf-16-untrans
+;;   `(0
+;;      (if (r3 < 128)
+;; 	 (r0 = ,(charset-id 'ascii))
+;;        (if (r3 < 160)
+;; 	   (r0 = ,(charset-id 'eight-bit-control))
+;; 	 (r0 = ,(charset-id 'eight-bit-graphic))))
+;;      (if (r4 < 128)
+;; 	 (r0 = ,(charset-id 'ascii))
+;;        (if (r4 < 160)
+;; 	   (r0 = ,(charset-id 'eight-bit-control))
+;; 	 (r0 = ,(charset-id 'eight-bit-graphic))))
+;;      (r1 = r4)))
+;;   "Do a multibyte write for bytes in r3 and r4.
+;; First swap them if we're big endian, indicated by r5==0.
+;; Intended for untranslatable utf-16 sequences.")
+
+;; Needed in macro expansion, so can't be let-bound.  Zapped after use.
+(eval-and-compile
+(defconst utf-16-decode-ucs
+  ;; CCL fragment spliced into both decoders below.
+  ;; Input: the 16-bit Unicode value in r1.
+  ;; Output: a charset ID in r0 and the code within that charset in
+  ;; r1.  r2 is scratch for packing the two-dimensional mule-unicode
+  ;; codes as ((index / 96 + 32) << 7) + (index % 96 + 32).
+  ;; Values from #x3400 up to (but excluding) #xe000 all come out as
+  ;; U+fffd; for a surrogate the following two bytes are read and
+  ;; dropped as well.
+  ;; NOTE(review): the lookup below names r0 and r3, whereas the
+  ;; Unicode value is in r1 -- confirm which register
+  ;; `lookup-integer' uses as its key.
+  `((lookup-integer utf-8-subst-table r0 r3)
+    (if r7 (r1 = r3))			; got a translation
+    (if (r1 < 128)
+       (r0 = ,(charset-id 'ascii))
+     (if (r1 < 160)
+	 (r0 = ,(charset-id 'eight-bit-control))
+       (if (r1 < 256)
+	   ((r0 = ,(charset-id 'latin-iso8859-1))
+	    (r1 -= 128))
+	 (if (r1 < #x2500)
+	     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
+	      (r1 -= #x100)
+	      (r2 = (((r1 / 96) + 32) << 7))
+	      (r1 %= 96)
+	      (r1 += (r2 + 32)))
+	   (if (r1 < #x3400)
+	       ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
+		(r1 -= #x2500)
+		(r2 = (((r1 / 96) + 32) << 7))
+		(r1 %= 96)
+		(r1 += (r2 + 32)))
+	     (if (r1 < #xd800)		; 2 untranslated bytes
+;;		 ;; Assume this is rare, so don't worry about the
+;; 		 ;; overhead of the call.
+;; 		 (call mule-utf-16-untrans)
+		 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+		  (r1 = 15037))		; U+fffd
+	       (if (r1 < #xe000)	; surrogate
+;; 			((call mule-utf-16-untrans)
+;; 			 (write-multibyte-character r0 r1)
+;; 			 (read r3 r4)
+;; 			 (call mule-utf-16-untrans))
+		   ((read r3 r4)
+		    (r0 = ,(charset-id 'mule-unicode-e000-ffff))
+		    (r1 = 15037))
+		 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
+		  (r1 -= #xe000)
+		  (r2 = (((r1 / 96) + 32) << 7))
+		  (r1 %= 96)
+		  (r1 += (r2 + 32)))))))))))))
+
+;; Decoder for little endian input.  The first two bytes are read and
+;; discarded; after that each pass of the loop consumes two bytes and
+;; writes one character.
+(define-ccl-program ccl-decode-mule-utf-16-le
+  `(2					; 2 bytes -> 1 to 4 bytes
+    ((read r0 r1)			; signature
+     (loop
+      (read r3 r4)
+      ;; Little endian: the second byte read (r4) is the high-order
+      ;; one, so it is the operand that gets shifted by CCL's `<8'.
+      (r1 = (r4 <8 r3))
+      ,utf-16-decode-ucs
+      (translate-character utf-8-translation-table-for-decode r0 r1)
+      (write-multibyte-character r0 r1)
+      (repeat))))
+  "Decode little endian UTF-16 (ignoring signature bytes).
+Basic decoding is done into the charsets ascii, latin-iso8859-1 and
+mule-unicode-*.  Un-representable Unicode characters are
+decoded as U+fffd.  The result is run through translation table
+`utf-8-translation-table-for-decode' if that is defined.")
+
+;; Decoder for big endian input.  Identical to the little endian
+;; decoder except for the order in which the two bytes are combined.
+(define-ccl-program ccl-decode-mule-utf-16-be
+  `(2					; 2 bytes -> 1 to 4 bytes
+    ((read r0 r1)			; signature
+     (loop
+      (read r3 r4)
+      ;; Big endian: the first byte read (r3) is the high-order one.
+      (r1 = (r3 <8 r4))
+      ,utf-16-decode-ucs
+      (translate-character utf-8-translation-table-for-decode r0 r1)
+      (write-multibyte-character r0 r1)
+      (repeat))))
+  "Decode big endian UTF-16 (ignoring signature bytes).
+Basic decoding is done into the charsets ascii, latin-iso8859-1 and
+mule-unicode-*.  Un-representable Unicode characters are
+decoded as U+fffd.  The result is run through translation table
+`utf-8-translation-table-for-decode' if that is defined.")
+
+(makunbound 'utf-16-decode-ucs)		; done with it
+
+(eval-and-compile
+(defconst utf-16-decode-to-ucs
+  ;; CCL which, given the result of a multibyte read in r0 and r1,
+  ;; sets r0 to the character's Unicode if the charset is one of the
+  ;; basic utf-8 coding system ones.  Otherwise set to U+fffd.
+  ;; This fragment is spliced into both encoders below.  r1, r2 and
+  ;; r3 are clobbered.
+  `(if (r0 == ,(charset-id 'ascii))
+       (r0 = r1)
+     (if (r0 == ,(charset-id 'latin-iso8859-1))
+	 (r0 = (r1 + 128))
+       (if (r0 == ,(charset-id 'eight-bit-control))
+	   (r0 = r1)
+	 (if (r0 == ,(charset-id 'eight-bit-graphic))
+	     (r0 = r1)
+	   ;; Undo the two-dimensional packing used by the mule-unicode
+	   ;; charsets: r1 holds (row << 7) + column with both parts
+	   ;; offset by 32, and r3 becomes the index row * 96 + column
+	   ;; to be added to the charset's base.  This is computed
+	   ;; before the charset is known to be a mule-unicode one;
+	   ;; for any other charset the result is simply unused.
+	   ((r2 = (r1 & #x7f))
+	    (r1 >>= 7)
+	    (r3 = ((r1 - 32) * 96))
+	    (r3 += (r2 - 32))
+	    (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
+		(r0 = (r3 + #x100))
+	      (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
+		  (r0 = (r3 + #x2500))
+		(if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
+		    (r0 = (r3 + #xe000))
+		  (r0 = #xfffd)))))))))))
+
+;; Encoder for little endian output.  The two signature bytes are
+;; written once, before the loop; each pass then reads one character
+;; and writes exactly two bytes.
+(define-ccl-program ccl-encode-mule-utf-16-le
+  `(1
+    ((write #xff)			; signature, low byte first
+     (write #xfe)
+     (loop
+      (read-multibyte-character r0 r1)
+      (translate-character ucs-mule-to-mule-unicode r0 r1)
+      ,utf-16-decode-to-ucs
+      ;; r0 now holds the Unicode value; emit low byte, then high.
+      (write (r0 & 255))
+      (write (r0 >> 8))
+      (repeat))))
+  "Encode to little endian UTF-16 with signature.
+Characters from the charsets ascii, eight-bit-control,
+eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
+after translation through the table `ucs-mule-to-mule-unicode'.
+Others are encoded as U+FFFD.")
+
+;; Encoder for big endian output.  The two signature bytes are
+;; written once, before the loop; each pass then reads one character
+;; and writes exactly two bytes.
+(define-ccl-program ccl-encode-mule-utf-16-be
+  `(1
+    ((write #xfe)			; signature, high byte first
+     (write #xff)
+     (loop
+      (read-multibyte-character r0 r1)
+      (translate-character ucs-mule-to-mule-unicode r0 r1)
+      ,utf-16-decode-to-ucs
+      ;; r0 now holds the Unicode value; emit high byte, then low.
+      (write (r0 >> 8))
+      (write (r0 & 255))
+      (repeat))))
+  "Encode to big endian UTF-16 with signature.
+Characters from the charsets ascii, eight-bit-control,
+eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are encoded
+after translation through the table `ucs-mule-to-mule-unicode'.
+Others are encoded as U+FFFD.")
+
+;; The fragment was only needed while expanding the two encoder
+;; definitions above, so drop the variable again.
+(makunbound 'utf-16-decode-to-ucs)
+
+(defun utf-16-le-pre-write-conversion (beg end)
+  "Semi-dummy pre-write function effectively to autoload ucs-tables."
+  ;; BEG and END are not used; the text is left untouched and the
+  ;; function always returns nil.
+  (prog1 nil
+    ;; Make sure the translation table is loaded before encoding.
+    (require 'ucs-tables)
+    ;; One-shot: clear the property so this is not run again.
+    (coding-system-put 'mule-utf-16-le 'pre-write-conversion nil)))
+
+(defun utf-16-be-pre-write-conversion (beg end)
+  "Semi-dummy pre-write function effectively to autoload ucs-tables."
+  ;; BEG and END are not used; the text is left untouched and the
+  ;; function always returns nil.
+  (prog1 nil
+    ;; Make sure the translation table is loaded before encoding.
+    (require 'ucs-tables)
+    ;; One-shot: clear the property so this is not run again.
+    (coding-system-put 'mule-utf-16-be 'pre-write-conversion nil)))
+
+;; Define the two coding systems.  `doc' holds the docstring text
+;; common to both; each definition prepends its own first line.
+;; NOTE(review): the shared text gives the upper range as
+;; U+E200-U+FFFF, but the decoder fragment maps every value from
+;; #xe000 up into mule-unicode-e000-ffff -- confirm whether U+E000
+;; was meant.
+(let ((doc "
+
+Assumes and ignores the leading two-byte signature.
+
+The supported Emacs character sets are the following, plus others
+which may be included in the translation table
+`ucs-mule-to-mule-unicode':
+ ascii
+ eight-bit-control
+ latin-iso8859-1
+ mule-unicode-0100-24ff
+ mule-unicode-2500-33ff
+ mule-unicode-e000-ffff
+
+Note that Unicode characters out of the ranges U+0000-U+33FF and
+U+E200-U+FFFF are decoded as U+FFFD, effectively corrupting the data
+if they are re-encoded.  Emacs characters without Unicode conversions
+are encoded as U+FFFD."))
+  ;; Arguments after the name are, in order: the coding system type
+  ;; (4 -- presumably the CCL-based type, given the decoder/encoder
+  ;; pair passed below; see `make-coding-system'), the mnemonic
+  ;; character, the docstring, the (DECODER . ENCODER) pair of CCL
+  ;; programs, and the property alist.
+  ;; NOTE(review): the encoders also accept eight-bit-graphic, which
+  ;; is not listed in `safe-charsets' here -- confirm that is
+  ;; deliberate.
+  (make-coding-system
+   'mule-utf-16-le 4
+   ?u	      ; Mule-UCS uses ?U, but code-pages uses that for koi8-u.
+   (concat
+    "Little endian UTF-16 encoding for Emacs-supported Unicode characters."
+    doc)
+
+   '(ccl-decode-mule-utf-16-le . ccl-encode-mule-utf-16-le)
+   '((safe-charsets
+      ascii
+      eight-bit-control
+      latin-iso8859-1
+      mule-unicode-0100-24ff
+      mule-unicode-2500-33ff
+      mule-unicode-e000-ffff)
+     (mime-charset . utf-16le)
+     (coding-category . coding-category-utf-16-le)
+     (valid-codes (0 . 255))
+     (pre-write-conversion . utf-16-le-pre-write-conversion)))
+
+  ;; Same again for big endian; only the name, first docstring line,
+  ;; CCL programs, MIME charset, coding category and pre-write
+  ;; function differ.
+  (make-coding-system
+   'mule-utf-16-be 4 ?u
+   (concat
+    "Big endian UTF-16 encoding for Emacs-supported Unicode characters."
+    doc)
+
+   '(ccl-decode-mule-utf-16-be . ccl-encode-mule-utf-16-be)
+   '((safe-charsets
+      ascii
+      eight-bit-control
+      latin-iso8859-1
+      mule-unicode-0100-24ff
+      mule-unicode-2500-33ff
+      mule-unicode-e000-ffff)
+     (mime-charset . utf-16be)
+     (coding-category . coding-category-utf-16-be)
+     (valid-codes (0 . 255))
+     (pre-write-conversion . utf-16-be-pre-write-conversion)))
+  )
+
+;; Make the coding systems available under the shorter names
+;; `utf-16-le' and `utf-16-be' as well.
+(define-coding-system-alias 'utf-16-le 'mule-utf-16-le)
+(define-coding-system-alias 'utf-16-be 'mule-utf-16-be)
+
+;; Announce the feature so that (require 'utf-16) works.
+(provide 'utf-16)
+
+;;; utf-16.el ends here