Improve locale and language environment setting at startup

* lisp/international/mule-cmds.el (locale-language-names): Add more locales and their language environments. (set-locale-environment): Use w32-multibyte-code-page, if non-zero, as locale-coding-system. (Bug#34684) * src/w32fns.c (globals_of_w32fns) <w32-multibyte-code-page>: New variable. * etc/NEWS: Mention w32-multibyte-code-page.
2019-03-16 13:59:03 +02:00 · 2019-03-16 13:59:03 +02:00 · 34dd4e0a83
commit 34dd4e0a83
parent 164b78c714
3 changed files with 97 additions and 32 deletions
--- a/etc/NEWS
+++ b/etc/NEWS
@ -1736,6 +1736,14 @@ versions of MS-Windows.  Set this variable to 50 if for some reason
 you need the old behavior (and please report such situations to Emacs
 developers).

+---
+** New variable 'w32-multibyte-code-page'.
+This variable holds the value of the multibyte code page used by the
+system.  It is usually zero, which indicates that 'w32-ansi-code-page'
+is being used, except in Far Eastern locales.  When this variable is
+non-zero, Emacs at startup sets 'locale-coding-system' to the
+corresponding encoding, instead of using 'w32-ansi-code-page'.
+
 +++
 ** On NS the behaviour of drag and drop can now be modified by use of
 modifier keys in line with Apples guidelines.  This makes the drag and
--- a/lisp/international/mule-cmds.el
+++ b/lisp/international/mule-cmds.el
@ -2181,22 +2181,27 @@ See `set-language-info-alist' for use in programs."
 (defconst locale-language-names
  (purecopy
   '(
-    ;; Locale names of the form LANGUAGE[_TERRITORY][.CODESET][@MODIFIER]
-    ;; as specified in the Single Unix Spec, Version 2.
-    ;; LANGUAGE is a language code taken from ISO 639:1988 (E/F)
-    ;; with additions from ISO 639/RA Newsletter No.1/1989;
-    ;; see Internet RFC 2165 (1997-06) and
-    ;; http://www.evertype.com/standards/iso639/iso639-en.html
-    ;; TERRITORY is a country code taken from ISO 3166
-    ;; http://www.din.de/gremien/nas/nabd/iso3166ma/codlstp1/en_listp1.html.
-    ;; CODESET and MODIFIER are implementation-dependent.
+     ;; Locale names of the form LANGUAGE[_TERRITORY][.CODESET][@MODIFIER]
+     ;; as specified in the Single Unix Spec, Version 2.
+     ;; LANGUAGE is a language code taken from ISO 639:1988 (E/F)
+     ;; with additions from ISO 639/RA Newsletter No.1/1989;
+     ;; see Internet RFC 2165 (1997-06) and
+     ;; http://www.evertype.com/standards/iso639/iso639-en.html
+     ;; TERRITORY is a country code taken from ISO 3166
+     ;; http://www.din.de/gremien/nas/nabd/iso3166ma/codlstp1/en_listp1.html.
+     ;; CODESET and MODIFIER are implementation-dependent.
+
+     ;; Language names for which there are no locales (yet) are
+     ;; commented out.

     ;; jasonr comments: MS Windows uses three letter codes for
     ;; languages instead of the two letter ISO codes that POSIX
-     ;; uses. In most cases the first two letters are the same, so
-     ;; most of the regexps in locale-language-names work. Japanese
-     ;; and Chinese are exceptions, which are listed in the
-     ;; non-standard section at the bottom of locale-language-names.
+     ;; uses.  In most cases the first two letters are the same, so
+     ;; most of the regexps in locale-language-names work.  Japanese,
+     ;; Chinese, and some others are exceptions, which are listed in the
+     ;; non-standard section at the bottom of locale-language-names, or
+     ;; in the main section, if otherwise we would pick up the wrong
+     ;; entry (because the first matching entry is used).

    ("aa_DJ" . "Latin-1") ; Afar
    ("aa" . "UTF-8")
@ -2204,11 +2209,12 @@ See `set-language-info-alist' for use in programs."
    ("af" . "Latin-1") ; Afrikaans
    ("am" "Ethiopic" utf-8) ; Amharic
    ("an" . "Latin-9") ; Aragonese
+    ("arn" . "UTF-8") ; MS-Windows Mapudungun, Mapuche
    ("ar" . "Arabic")
-    ; as Assamese
+    ("as" . "UTF-8") ; Assamese
    ; ay Aymara
    ("az" . "UTF-8") ; Azerbaijani
-    ; ba Bashkir
+    ("ba" . "UTF-8") ; Bashkir, Cyrillic script
    ("be" "Belarusian" cp1251) ; Belarusian [Byelorussian until early 1990s]
    ("bg" "Bulgarian" cp1251) ; Bulgarian
    ; bh Bihari
@ -2219,12 +2225,12 @@ See `set-language-info-alist' for use in programs."
    ("bs" . "Latin-2") ; Bosnian
    ("byn" . "UTF-8")  ; Bilin; Blin
    ("ca" "Catalan" iso-8859-1) ; Catalan
-    ; co Corsican
+    ("co" . "UTF-8") ; Corsican
    ("cs" "Czech" iso-8859-2)
    ("cy" "Welsh" iso-8859-14)
    ("da" . "Latin-1") ; Danish
    ("de" "German" iso-8859-1)
-    ; dv Divehi
+    ("dv" . "UTF-8") ; Divehi
    ; dz Bhutani
    ("ee" . "Latin-4") ; Ewe
    ("el" "Greek" iso-8859-7)
@ -2238,6 +2244,8 @@ See `set-language-info-alist' for use in programs."
    ("et" . "Latin-9") ; Estonian
    ("eu" . "Latin-1") ; Basque
    ("fa" "Persian" utf-8) ; Persian
+    ("fil" . "UTF-8") ; Filipino
+    ("fpo" . "UTF-8") ; MS-Windows Filipino
    ("fi" . "Latin-9") ; Finnish
    ("fj" . "Latin-1") ; Fiji
    ("fo" . "Latin-1") ; Faroese
@ -2246,6 +2254,7 @@ See `set-language-info-alist' for use in programs."
    ("ga" . "Latin-1") ; Irish Gaelic (new orthography)
    ("gd" . "Latin-9") ; Scots Gaelic
    ("gez" "Ethiopic" utf-8) ; Geez
+    ("gla" . "Latin-9") ; MS-Windows Scots Gaelic
    ("gl" . "Latin-1") ; Gallegan; Galician
    ; gn Guarani
    ("gu" "Gujarati" utf-8) ; Gujarati
@ -2256,27 +2265,33 @@ See `set-language-info-alist' for use in programs."
    ("hni_IN" . "UTF-8") ; Chhattisgarhi
    ("hr" "Croatian" iso-8859-2) ; Croatian
    ("hu" . "Latin-2") ; Hungarian
-    ; hy Armenian
+    ("hy" . "UTF-8") ; Armenian
    ; ia Interlingua
    ("id" . "Latin-1") ; Indonesian
    ; ie Interlingue
-    ; ik Inupiak
+    ("ig" . "UTF-8") ; Igbo (Nigeria)
+    ("ibo" . "UTF-8") ; MS-Windows Igbo
+    ; ik Inupiak, Inupiaq
    ("is" . "Latin-1") ; Icelandic
    ("it" "Italian" iso-8859-1) ; Italian
    ; iu Inuktitut
    ("iw" "Hebrew" iso-8859-8)
    ("ja" "Japanese" euc-jp)
    ; jw Javanese
+    ("kal" . "Latin-1") ; MS-Windows Greenlandic
    ("ka" "Georgian" georgian-ps) ; Georgian
-    ; kk Kazakh
+    ("kk" . "UTF-8") ; Kazakh
    ("kl" . "Latin-1") ; Greenlandic
    ("km" "Khmer" utf-8) ; Cambodian, Khmer
+    ("knk" "Devanagari" utf-8) ; MS-Windows Konkani
+    ("kok" "Devanagari" utf-8) ; Konkani
    ("kn" "Kannada" utf-8)
    ("ko" "Korean" euc-kr)
    ("ks" . "UTF-8") ; Kashmiri
    ; ku Kurdish
    ("kw" . "Latin-1") ; Cornish
    ("ky" . "UTF-8") ; Kirghiz
+    ("lao" "Lao" utf-8) ; MS-Windows Lao
    ("la" . "Latin-1") ; Latin
    ("lb" . "Latin-1") ; Luxemburgish
    ("lg" . "Latin-6") ; Ganda, a.k.a. Luganda
@ -2287,18 +2302,22 @@ See `set-language-info-alist' for use in programs."
    ; mg Malagasy
    ("mi" . "Latin-7") ; Maori
    ("mk" "Cyrillic-ISO" iso-8859-5) ; Macedonian
+    ("mlt" . "Latin-3") ; MS-Windows Maltese
    ("ml" "Malayalam" utf-8)
    ("mn" . "UTF-8") ; Mongolian
-    ; mo Moldavian
+    ; mo Moldavian (retired)
+    ("mri" . "Latin-7") ; MS-Windows Maori
    ("mr" "Devanagari" utf-8) ; Marathi
    ("ms" . "Latin-1") ; Malay
    ("mt" . "Latin-3") ; Maltese
+    ("mym" "Malayalam" utf-8) ; MS-Windows Malayalam
    ("my" "Burmese" utf-8) ; Burmese
    ; na Nauru
    ("nb" . "Latin-1") ; Norwegian
    ("ne" "Devanagari" utf-8) ; Nepali
    ("nl" "Dutch" iso-8859-1)
    ("nn" . "Latin-1") ; Norwegian Nynorsk
+    ("non" . "Latin-1") ; MS-Windows Norwegian Nynorsk
    ("no" . "Latin-1") ; Norwegian
    ("nr_ZA" . "UTF-8") ; South Ndebele
    ("nso_ZA" . "UTF-8") ; Pedi
@ -2308,7 +2327,8 @@ See `set-language-info-alist' for use in programs."
    ("or" "Oriya" utf-8)
    ("pa" "Punjabi" utf-8) ; Punjabi
    ("pl" "Polish" iso-8859-2) ; Polish
-    ; ps Pashto, Pushto
+    ("ps" . "UTF-8") ; Pashto, Pushto
+    ("pas" . "UTF-8") ; MS-Windows Pashto
    ("pt_BR" "Brazilian Portuguese" iso-8859-1) ; Brazilian Portuguese
    ("pt" . "Latin-1") ; Portuguese
    ; qu Quechua
@ -2318,7 +2338,7 @@ See `set-language-info-alist' for use in programs."
    ("ru_RU.koi8r" "Cyrillic-KOI8" koi8-r)
    ("ru_RU" "Russian" iso-8859-5)
    ("ru_UA" "Russian" koi8-u)
-    ; rw Kinyarwanda
+    ("rw" . "UTF-8") ; Kinyarwanda
    ("sa" . "Devanagari") ; Sanskrit
    ; sd Sindhi
    ("se" . "UTF-8") ; Northern Sami
@ -2339,6 +2359,7 @@ See `set-language-info-alist' for use in programs."
    ; su Sundanese
    ("sv" "Swedish" iso-8859-1)		; Swedish
    ("sw" . "Latin-1") ; Swahili
+    ("taj" "Tajik" koi8-t) ; MS-Windows Tajik w/Cyrillic script
    ("ta" "Tamil" utf-8)
    ("te" "Telugu" utf-8) ; Telugu
    ("tg" "Tajik" koi8-t)
@ -2348,15 +2369,17 @@ See `set-language-info-alist' for use in programs."
    ("th" "Thai" iso-8859-11)
    ("ti" "Ethiopic" utf-8) ; Tigrinya
    ("tig_ER" . "UTF-8") ; Tigre
-    ; tk Turkmen
+    ("tk" . "Latin-5") ; Turkmen
+    ("tuk" . "Latin-5") ; MS-Windows Turkmen
    ("tl" . "Latin-1") ; Tagalog
    ("tn" . "Latin-9") ; Setswana, Tswana
    ; to Tonga
    ("tr" "Turkish" iso-8859-9)
+    ("tsn" . "Latin-9") ; MS-Windows Tswana
    ("ts" . "Latin-1") ; Tsonga
    ("tt" . "UTF-8") ; Tatar
    ; tw Twi
-    ; ug Uighur
+    ("ug" . "UTF-8") ; Uighur
    ("uk" "Ukrainian" koi8-u)
    ("ur" . "UTF-8") ; Urdu
    ("uz_UZ@cyrillic" . "UTF-8"); Uzbek
@ -2365,10 +2388,10 @@ See `set-language-info-alist' for use in programs."
    ("vi" "Vietnamese" utf-8)
    ; vo Volapuk
    ("wa" . "Latin-1") ; Walloon
-    ; wo Wolof
+    ("wo" . "UTF-8") ; Wolof
    ("xh" . "Latin-1") ; Xhosa
    ("yi" . "Windows-1255") ; Yiddish
-    ; yo Yoruba
+    ("yo" . "UTF-8") ; Yoruba
    ; za Zhuang
    ("zh_HK" . "Chinese-Big5")
    ; zh_HK/BIG5-HKSCS \
@ -2378,6 +2401,9 @@ See `set-language-info-alist' for use in programs."
    ("zh_CN.GB18030" "Chinese-GB18030")
    ("zh_CN.UTF-8" . "Chinese-GBK")
    ("zh_CN" . "Chinese-GB")
+    ("zhh" . "Chinese-Big5") ; MS-Windows Chinese (Hong Kong S.A.R.)
+    ("zhi" . "Chinese-GBK") ; MS-Windows Chinese (Singapore)
+    ("zhm" . "Chinese-Big5") ; MS-Windows Chinese (Macao S.A.R.)
    ("zh" . "Chinese-GB")
    ("zu" . "Latin-1") ; Zulu

@ -2395,12 +2421,23 @@ See `set-language-info-alist' for use in programs."
    ("sp" . "Cyrillic-ISO") ; Serbian (Cyrillic alphabet), e.g. X11R6.4
    ("su" . "Latin-1") ; Finnish, e.g. Solaris 2.6
    ("jp" . "Japanese") ; e.g. MS Windows
-    ("chs" . "Chinese-GBK") ; MS Windows Chinese Simplified
-    ("cht" . "Chinese-BIG5") ; MS Windows Chinese Traditional
+    ("chs" . "Chinese-GBK") ; MS Windows Chinese Simplified (PRC)
+    ("cht" . "Chinese-BIG5") ; MS Windows Chinese Traditional (Taiwan)
    ("gbz" . "UTF-8") ; MS Windows Dari Persian
    ("div" . "UTF-8") ; MS Windows Divehi (Maldives)
    ("wee" . "Latin-2") ; MS Windows Lower Sorbian
    ("wen" . "Latin-2") ; MS Windows Upper Sorbian
+    ("ind" . "Latin-1") ; MS-Windows Indonesian
+    ("sme" . "UTF-8") ; MS-Windows Northern Sami (Norway)
+    ("smf" . "UTF-8") ; MS-Windows Northern Sami (Sweden)
+    ("smg" . "UTF-8") ; MS-Windows Northern Sami (Finland)
+    ("kdi" "Kannada" utf-8) ; MS-Windows Kannada
+    ("mar" "Devanagari" utf-8) ; MS-Windows Marathi
+    ("khm" "Khmer" utf-8) ; MS-Windows Khmer
+    ("iri" . "Latin-1") ; MS-Windows Irish Gaelic
+    ; mwk  MS-Windows Mohawk (Canada)
+    ("uig" . "UTF-8") ; MS-Windows Uighur
+    ("kin" . "UTF-8") ; MS-Windows Kinyarwanda
    ))
  "Alist of locale regexps vs the corresponding languages and coding systems.
 Each element has this form:
@ -2702,10 +2739,20 @@ See also `locale-charset-language-names', `locale-language-names',
             (output-coding
              (if noninteractive
                  (intern (format "cp%d" (w32-get-console-output-codepage)))
-                code-page-coding)))
-	(when (coding-system-p code-page-coding)
+                code-page-coding))
+             (multibyte-code-page-coding
+              (or (and (boundp 'w32-multibyte-code-page)
+                       (not (zerop w32-multibyte-code-page))
+                       (intern (format "cp%d" w32-multibyte-code-page)))
+                  code-page-coding))
+             (locale-coding
+              (if noninteractive
+                  code-page-coding
+                multibyte-code-page-coding)))
+	(when (and (coding-system-p code-page-coding)
+                   (coding-system-p locale-coding))
          (or output-coding (setq output-coding code-page-coding))
-	  (unless frame (setq locale-coding-system code-page-coding))
+	  (unless frame (setq locale-coding-system locale-coding))
 	  (set-keyboard-coding-system code-page-coding frame)
 	  (set-terminal-coding-system output-coding frame)
 	  (setq default-file-name-coding-system ansi-code-page-coding))))
--- a/src/w32fns.c
+++ b/src/w32fns.c
@ -48,6 +48,7 @@ along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */

 #ifdef WINDOWSNT
 #include <mbstring.h>
+#include <mbctype.h>	/* for _getmbcp */
 #endif /* WINDOWSNT */

 #if CYGWIN
@ -10908,6 +10909,15 @@ globals_of_w32fns (void)
 	      doc: /* The ANSI code page used by the system.  */);
  w32_ansi_code_page = GetACP ();

+#ifndef CYGWIN
+  DEFVAR_INT ("w32-multibyte-code-page",
+	      w32_multibyte_code_page,
+	      doc: /* The current multibyte code page used by the system.
+A value of zero indicates that the single-byte code page is in use,
+see `w32-ansi-code-page'.  */);
+  w32_multibyte_code_page = _getmbcp ();
+#endif
+
  if (os_subtype == OS_NT)
    w32_unicode_gui = 1;
  else