Implement case-insensitive and Unicode-compliant collation on MS-Windows.

src/fns.c (Fstring_collate_lessp, Fstring_collate_equalp): Doc fix.
 src/w32proc.c (w32_compare_strings): Accept additional argument
 IGNORE_CASE.  Set up the flags for CompareStringW to ignore case
 if requested.  If w32-collate-ignore-punctuation is non-nil, add
 NORM_IGNORESYMBOLS to the flags.
 (LINGUISTIC_IGNORECASE): Define if not already defined.
 (syms_of_ntproc) <Vw32_collate_ignore_punctuation>: New variable.
 src/sysdep.c (str_collate) [WINDOWSNT]: Adapt to the interface
 change.
 src/w32.h: Adjust prototype of w32_compare_strings.

 etc/NEWS: Mention w32-collate-ignore-punctuation.

Fixes: debbugs:18051
This commit is contained in:
Eli Zaretskii 2014-08-29 22:18:06 +03:00
parent 2ae366c73e
commit 21ba51de76
7 changed files with 80 additions and 14 deletions

View file

@ -1,3 +1,7 @@
2014-08-29 Eli Zaretskii <eliz@gnu.org>
* NEWS: Mention w32-collate-ignore-punctuation.
2014-08-29 Dmitry Antipov <dmantipov@yandex.ru>
* NEWS: Mention that `sort' can handle vectors.

View file

@ -72,6 +72,13 @@ environment. For the time being this is implemented for modern POSIX
systems and for MS-Windows, for other systems they fall back to their
counterparts `string-lessp' and `string-equal'.
*** The MS-Windows specific variable `w32-collate-ignore-punctuation',
if set to a non-nil value, causes the above 2 functions to ignore
symbol and punctuation characters when collating strings. This
emulates the behavior of modern POSIX platforms when the locale's
codeset is "UTF-8" (as in "en_US.UTF-8"). This is needed because
MS-Windows doesn't support UTF-8 as codeset in its locales.
* Editing Changes in Emacs 24.5

View file

@ -1,3 +1,17 @@
2014-08-29 Eli Zaretskii <eliz@gnu.org>
* fns.c (Fstring_collate_lessp, Fstring_collate_equalp): Doc fix.
* w32proc.c (w32_compare_strings): Accept additional argument
IGNORE_CASE. Set up the flags for CompareStringW to ignore case
if requested. If w32-collate-ignore-punctuation is non-nil, add
NORM_IGNORESYMBOLS to the flags.
(LINGUISTIC_IGNORECASE): Define if not already defined.
(syms_of_ntproc) <Vw32_collate_ignore_punctuation>: New variable.
* sysdep.c (str_collate) [WINDOWSNT]: Adapt to the interface
change.
2014-08-29 Michael Albinus <michael.albinus@gmx.de>
* sysdep.c (LC_CTYPE, LC_CTYPE_MASK, towlower_l):

View file

@ -350,7 +350,7 @@ Symbols are also allowed; their print names are used instead.
This function obeys the conventions for collation order in your
locale settings. For example, punctuation and whitespace characters
are considered less significant for sorting:
might be considered less significant for sorting:
\(sort '\("11" "12" "1 1" "1 2" "1.1" "1.2") 'string-collate-lessp)
=> \("11" "1 1" "1.1" "12" "1 2" "1.2")
@ -358,11 +358,15 @@ are considered less significant for sorting:
The optional argument LOCALE, a string, overrides the setting of your
current locale identifier for collation. The value is system
dependent; a LOCALE \"en_US.UTF-8\" is applicable on POSIX systems,
while it would be \"English_USA.1252\" on MS Windows systems.
while it would be, e.g., \"enu_USA.1252\" on MS-Windows systems.
If IGNORE-CASE is non-nil, characters are converted to lower-case
before comparing them.
To emulate Unicode-compliant collation on MS-Windows systems,
bind `w32-collate-ignore-punctuation' to a non-nil value, since
the codeset part of the locale cannot be \"UTF-8\" on MS-Windows.
If your system does not support a locale environment, this function
behaves like `string-lessp'. */)
(Lisp_Object s1, Lisp_Object s2, Lisp_Object locale, Lisp_Object ignore_case)
@ -391,8 +395,8 @@ Symbols are also allowed; their print names are used instead.
This function obeys the conventions for collation order in your locale
settings. For example, characters with different code points but
the same meaning are considered as equal, like different grave accent
unicode characters:
the same meaning might be considered as equal, like different grave
accent Unicode characters:
\(string-collate-equalp \(string ?\\uFF40) \(string ?\\u1FEF))
=> t
@ -400,13 +404,20 @@ unicode characters:
The optional argument LOCALE, a string, overrides the setting of your
current locale identifier for collation. The value is system
dependent; a LOCALE \"en_US.UTF-8\" is applicable on POSIX systems,
while it would be \"English_USA.1252\" on MS Windows systems.
while it would be, e.g., \"enu_USA.1252\" on MS-Windows systems.
If IGNORE-CASE is non-nil, characters are converted to lower-case
before comparing them.
To emulate Unicode-compliant collation on MS-Windows systems,
bind `w32-collate-ignore-punctuation' to a non-nil value, since
the codeset part of the locale cannot be \"UTF-8\" on MS-Windows.
If your system does not support a locale environment, this function
behaves like `string-equal'. */)
behaves like `string-equal'.
Do NOT use this function to compare file names for equality, only
for sorting them. */)
(Lisp_Object s1, Lisp_Object s2, Lisp_Object locale, Lisp_Object ignore_case)
{
#if defined __STDC_ISO_10646__ || defined WINDOWSNT

View file

@ -3796,6 +3796,6 @@ str_collate (Lisp_Object s1, Lisp_Object s2,
char *loc = STRINGP (locale) ? SSDATA (locale) : NULL;
return w32_compare_strings (SDATA (s1), SDATA (s2), loc);
return w32_compare_strings (SDATA (s1), SDATA (s2), loc, !NILP (ignore_case));
}
#endif /* WINDOWSNT */

View file

@ -211,7 +211,7 @@ extern int w32_memory_info (unsigned long long *, unsigned long long *,
unsigned long long *, unsigned long long *);
/* Compare 2 UTF-8 strings in locale-dependent fashion. */
extern int w32_compare_strings (const char *, const char *, char *);
extern int w32_compare_strings (const char *, const char *, char *, int);
#ifdef HAVE_GNUTLS
#include <gnutls/gnutls.h>

View file

@ -3213,15 +3213,20 @@ get_lcid (const char *locale_name)
#ifndef _NSLCMPERROR
# define _NSLCMPERROR INT_MAX
#endif
#ifndef LINGUISTIC_IGNORECASE
# define LINGUISTIC_IGNORECASE 0x00000010
#endif
int
w32_compare_strings (const char *s1, const char *s2, char *locname)
w32_compare_strings (const char *s1, const char *s2, char *locname,
int ignore_case)
{
LCID lcid = GetThreadLocale ();
wchar_t *string1_w, *string2_w;
int val, needed;
extern BOOL g_b_init_compare_string_w;
static int (WINAPI *pCompareStringW)(LCID, DWORD, LPCWSTR, int, LPCWSTR, int);
DWORD flags = 0;
USE_SAFE_ALLOCA;
@ -3284,11 +3289,22 @@ w32_compare_strings (const char *s1, const char *s2, char *locname)
lcid = new_lcid;
}
/* FIXME: Need a way to control the FLAGS argument, perhaps via the
CODESET part of LOCNAME. In particular, ls-lisp will want
NORM_IGNORESYMBOLS and sometimes LINGUISTIC_IGNORECASE or
NORM_IGNORECASE. */
val = pCompareStringW (lcid, 0, string1_w, -1, string2_w, -1);
if (ignore_case)
{
/* NORM_IGNORECASE ignores any tertiary distinction, not just
case variants. LINGUISTIC_IGNORECASE is more selective, and
is sensitive to the locale's language, but it is not
available before Vista. */
if (w32_major_version >= 6)
flags |= LINGUISTIC_IGNORECASE;
else
flags |= NORM_IGNORECASE;
}
/* This approximates what glibc collation functions do when the
locale's codeset is UTF-8. */
if (!NILP (Vw32_collate_ignore_punctuation))
flags |= NORM_IGNORESYMBOLS;
val = pCompareStringW (lcid, flags, string1_w, -1, string2_w, -1);
SAFE_FREE ();
if (!val)
{
@ -3408,6 +3424,20 @@ Any other non-nil value means do this even on remote and removable drives
where the performance impact may be noticeable even on modern hardware. */);
Vw32_get_true_file_attributes = Qlocal;
DEFVAR_LISP ("w32-collate-ignore-punctuation",
Vw32_collate_ignore_punctuation,
doc: /* Non-nil causes string collation functions to ignore punctuation on MS-Windows.
On Posix platforms, `string-collate-lessp' and `string-collate-equalp'
ignore punctuation characters when they compare strings, if the
locale's codeset is UTF-8, as in \"en_US.UTF-8\". Binding this option
to a non-nil value will achieve a similar effect on MS-Windows, where
locales with UTF-8 codeset are not supported.
Note that setting this to non-nil will also ignore blanks and symbols
in the strings. So do NOT use this option when comparing file names
for equality, only when you need to sort them. */);
Vw32_collate_ignore_punctuation = Qnil;
staticpro (&Vw32_valid_locale_ids);
staticpro (&Vw32_valid_codepages);
}