Improve the encoding by compound-text-with-extensions.

2010-08-06 17:11:19 +09:00 · 2010-08-06 17:11:19 +09:00 · 7a84eee5b7
commit 7a84eee5b7
parent faa28da9b7
5 changed files with 147 additions and 76 deletions
--- a/lisp/ChangeLog
+++ b/lisp/ChangeLog
@ -1,3 +1,14 @@
+2010-08-06  Kenichi Handa  <handa@m17n.org>
+
+	* international/mule.el (define-charset): Store NAME as :base
+	property.
+	(ctext-non-standard-encodings-table): Pay attention to charset
+	aliases.
+	(ctext-pre-write-conversion): Sort ctext-standard-encodings by the
+	current priority.  Force using the designation of the specific
+	charset by adding `charset' text property.  Improve the whole
+	algorithm.
+
 2010-08-04  Kenichi Handa  <handa@m17n.org>

 	* language/cyrillic.el: Don't add "microsoft-cp1251" to
--- a/lisp/international/mule.el
+++ b/lisp/international/mule.el
@ -282,6 +282,7 @@ attribute."
 	(plist-put props :short-name (symbol-name name)))
    (or (plist-get props :long-name)
 	(plist-put props :long-name (plist-get props :short-name)))
+    (plist-put props :base name)
    ;; We can probably get a worthwhile amount in purespace.
    (setq props
 	  (mapcar (lambda (elt)
@ -1535,11 +1536,13 @@ of `ctext-non-standard-encodings-alist'.")
      (let* ((slot (assoc elt ctext-non-standard-encodings-alist))
 	     (charset (nth 3 slot)))
 	(if (charsetp charset)
-	    (setcar tail (cons charset slot))
+	    (setcar tail
+		    (cons (plist-get (charset-plist charset) :base) slot))
 	  (setcar tail (cons (car charset) slot))
 	  (dolist (cs (cdr charset))
 	    (setcdr tail
-		    (cons (cons (car cs) slot) (cdr tail)))
+		    (cons (cons (plist-get (charset-plist (car cs)) :base) slot)
+			  (cdr tail)))
 	    (setq tail (cdr tail))))
 	(setq tail (cdr tail))))
    table))
@ -1559,74 +1562,56 @@ in-place."
      (setq from 1 to (point-max)))
    (save-restriction
      (narrow-to-region from to)
+      (goto-char from)
      (let ((encoding-table (ctext-non-standard-encodings-table))
-	    (charset-list ctext-standard-encodings)
+	    (charset-list (sort-charsets
+			   (copy-sequence ctext-standard-encodings)))
+	    (end-pos (make-marker))
 	    last-coding-system-used
-	    last-pos last-encoding-info
-	    encoding-info end-pos ch charset)
+	    last-pos charset encoding-info)
 	(dolist (elt encoding-table)
 	  (push (car elt) charset-list))
-	(goto-char (setq last-pos from))
 	(setq end-pos (point-marker))
-	(while (re-search-forward "[^\000-\177]+" nil t)
+	(while (re-search-forward "[^\0-\177]+" nil t)
 	  ;; Found a sequence of non-ASCII characters.
-	  (setq last-pos (match-beginning 0)
-		ch (char-after last-pos)
-		charset (char-charset ch charset-list)
-		last-encoding-info
-		(if charset
-		    (or (cdr (assq charset encoding-table))
-			charset)
-		  'utf-8))
 	  (set-marker end-pos (match-end 0))
-	  (goto-char (1+ last-pos))
-	  (while (marker-position end-pos)
-	    (if (< (point) end-pos)
-		(progn
-		  (setq charset (char-charset (following-char) charset-list)
-			encoding-info
-			(if charset
-			    (or (cdr (assq charset encoding-table))
-				charset)
-			  'utf-8))
-		  (forward-char 1))
-	      (setq encoding-info nil)
-	      (set-marker end-pos nil))
-	    (unless (eq last-encoding-info encoding-info)
-	      (cond ((consp last-encoding-info)
-		     ;; Encode the previous range using an extended
-		     ;; segment.
-		     (let ((encoding-name (car last-encoding-info))
-			   (coding-system (nth 1 last-encoding-info))
-			   (noctets (nth 2 last-encoding-info))
-			   len)
-		       (encode-coding-region last-pos (point) coding-system)
-		       (setq len (+ (length encoding-name) 1
-				    (- (point) last-pos)))
-		       ;; According to the spec of CTEXT, it is not
-		       ;; necessary to produce this extra designation
-		       ;; sequence, but some buggy application
-		       ;; (e.g. crxvt-gb) requires it.
-		       (insert "\e(B")
-		       (save-excursion
-			 (goto-char last-pos)
-			 (insert (format "\e%%/%d" noctets))
-			 (insert-byte (+ (/ len 128) 128) 1)
-			 (insert-byte (+ (% len 128) 128) 1)
-			 (insert encoding-name)
-			 (insert 2))))
-		    ((eq last-encoding-info 'utf-8)
-		     ;; Encode the previous range using UTF-8 encoding
-		     ;; extention.
-		     (encode-coding-region last-pos (point) 'mule-utf-8)
-		     (save-excursion
-		       (goto-char last-pos)
-		       (insert "\e%G"))
-		     (insert "\e%@"))
-		    (t
-		     (put-text-property last-pos (point) 'charset charset)))
-	      (setq last-pos (point)
-		    last-encoding-info encoding-info))))
+	  (goto-char (match-beginning 0))
+	  (setq last-pos (point)
+		charset (char-charset (following-char) charset-list))
+	  (forward-char 1)
+	  (while (and (< (point) end-pos)
+		      (eq charset (char-charset (following-char) charset-list)))
+	    (forward-char 1))
+	  (if charset
+	      (if (setq encoding-info (cdr (assq charset encoding-table)))
+		  ;; Encode this range using an extended segment.
+		  (let ((encoding-name (car encoding-info))
+			(coding-system (nth 1 encoding-info))
+			(noctets (nth 2 encoding-info))
+			len)
+		    (encode-coding-region last-pos (point) coding-system)
+		    (setq len (+ (length encoding-name) 1
+				 (- (point) last-pos)))
+		    ;; According to the spec of CTEXT, it is not
+		    ;; necessary to produce this extra designation
+		    ;; sequence, but some buggy application
+		    ;; (e.g. crxvt-gb) requires it.
+		    (insert "\e(B")
+		    (save-excursion
+		      (goto-char last-pos)
+		      (insert (format "\e%%/%d" noctets))
+		      (insert-byte (+ (/ len 128) 128) 1)
+		      (insert-byte (+ (% len 128) 128) 1)
+		      (insert encoding-name)
+		      (insert 2)))
+		;; Encode this range as characters in CHARSET.
+		(put-text-property last-pos (point) 'charset charset))
+	    ;; Encode this range using UTF-8 encoding extention.
+	    (encode-coding-region last-pos (point) 'mule-utf-8)
+	    (save-excursion
+	      (goto-char last-pos)
+	      (insert "\e%G"))
+	    (insert "\e%@")))
 	(goto-char (point-min)))))
  ;; Must return nil, as build_annotations_2 expects that.
  nil)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@ -1,3 +1,14 @@
+2010-08-06  Kenichi Handa  <handa@m17n.org>
+
+	* charset.c: Include <stdlib.h>
+	(struct charset_sort_data): New struct.
+	(charset_compare): New function.
+	(Fsort_charsets): New funciton.
+	(syms_of_charset): Declare Fsort_charsets as a Lisp function.
+
+	* coding.c (decode_coding_iso_2022): Fix checking of dimension
+	number in CTEXT extended segment.
+
 2010-08-01  Juanma Barranquero  <lekktu@gmail.com>

 	* w32fns.c (syms_of_w32fns) <x-max-tooltip-size>: Fix typo in docstring.
--- a/src/charset.c
+++ b/src/charset.c
@ -28,6 +28,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #include <config.h>

 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 #include <ctype.h>
 #include <sys/types.h>
@ -2139,23 +2140,22 @@ that case, find the charset from what supported by that coding system.  */)
    charset = CHAR_CHARSET (XINT (ch));
  else
    {
-      Lisp_Object charset_list;
-
      if (CONSP (restriction))
 	{
-	  for (charset_list = Qnil; CONSP (restriction);
-	       restriction = XCDR (restriction))
-	    {
-	      int id;
+	  int c = XFASTINT (ch);

-	      CHECK_CHARSET_GET_ID (XCAR (restriction), id);
-	      charset_list = Fcons (make_number (id), charset_list);
+	  for (; CONSP (restriction); restriction = XCDR (restriction))
+	    {
+	      struct charset *charset;
+
+	      CHECK_CHARSET_GET_CHARSET (XCAR (restriction), charset);
+	      if (ENCODE_CHAR (charset, c) != CHARSET_INVALID_CODE (charset))
+		return XCAR (restriction);
 	    }
-	  charset_list = Fnreverse (charset_list);
+	  return Qnil;
 	}
-      else
-	charset_list = coding_system_charset_list (restriction);
-      charset = char_charset (XINT (ch), charset_list, NULL);
+      restriction = coding_system_charset_list (restriction);
+      charset = char_charset (XINT (ch), restriction, NULL);
      if (! charset)
 	return Qnil;
    }
@ -2312,6 +2312,69 @@ Return charset identification number of CHARSET.  */)
  return make_number (id);
 }

+struct charset_sort_data
+{
+  Lisp_Object charset;
+  int id;
+  int priority;
+};
+
+static int
+charset_compare (const void *d1, const void *d2)
+{
+  const struct charset_sort_data *data1 = d1, *data2 = d2;
+  return (data1->priority - data2->priority);
+}
+
+DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0,
+       doc: /* Sort charset list CHARSETS by a priority of each charset.
+Return the sorted list.  CHARSETS is modified by side effects.
+See also `charset-priority-list' and `set-charset-priority'.  */)
+     (Lisp_Object charsets)
+{
+  Lisp_Object len = Flength (charsets);
+  int n = XFASTINT (len), i, j, done;
+  Lisp_Object tail, elt, attrs;
+  struct charset_sort_data *sort_data;
+  int id, min_id, max_id;
+  USE_SAFE_ALLOCA;
+
+  if (n == 0)
+    return Qnil;
+  SAFE_ALLOCA (sort_data, struct charset_sort_data *, sizeof (*sort_data) * n);
+  for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++)
+    {
+      elt = XCAR (tail);
+      CHECK_CHARSET_GET_ATTR (elt, attrs);
+      sort_data[i].charset = elt;
+      sort_data[i].id = id = XINT (CHARSET_ATTR_ID (attrs));
+      if (i == 0)
+	min_id = max_id = id;
+      else if (id < min_id)
+	min_id = id;
+      else if (id > max_id)
+	max_id = id;
+    }
+  for (done = 0, tail = Vcharset_ordered_list, i = 0;
+       done < n && CONSP (tail); tail = XCDR (tail), i++)
+    {
+      elt = XCAR (tail);
+      id = XFASTINT (elt);
+      if (id >= min_id && id <= max_id)
+	for (j = 0; j < n; j++)
+	  if (sort_data[j].id == id)
+	    {
+	      sort_data[j].priority = i;
+	      done++;
+	    }
+    }
+  qsort (sort_data, n, sizeof *sort_data, charset_compare);
+  for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++)
+    XSETCAR (tail, sort_data[i].charset);
+  SAFE_FREE ();
+  return charsets;
+}
+

 void
 init_charset ()
@ -2414,6 +2477,7 @@ syms_of_charset ()
  defsubr (&Scharset_priority_list);
  defsubr (&Sset_charset_priority);
  defsubr (&Scharset_id_internal);
+  defsubr (&Ssort_charsets);

  DEFVAR_LISP ("charset-map-path", &Vcharset_map_path,
 	       doc: /* *List of directories to search for charset map files.  */);
--- a/src/coding.c
+++ b/src/coding.c
@ -3935,7 +3935,7 @@ decode_coding_iso_2022 (coding)
 		  int size;

 		  ONE_MORE_BYTE (dim);
-		  if (dim < 0 || dim > 4)
+		  if (dim < '0' || dim > '4')
 		    goto invalid_code;
 		  ONE_MORE_BYTE (M);
 		  if (M < 128)