Make [:print:] support non-ASCII characters correctly

* src/regex.c (ISPRINT): Call 'printablep' for multibyte characters.
(BIT_PRINT): New bit mask.
(re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT.
* src/character.c (printablep): New function.
* src/character.h (printablep): Add prototype.

* lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior
of 'print', 'alnum', and 'alphabetic'.

* doc/lispref/searching.texi (Char Classes): Document the new
behavior of [:print:].

* etc/NEWS: Mention the new behavior of [:print:].
This commit is contained in:
Eli Zaretskii 2015-04-14 18:47:04 +03:00
parent 8802474a21
commit 6c284c6b58
6 changed files with 42 additions and 11 deletions

View file

@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text Representations}).
@item [:nonascii:]
This matches any non-@acronym{ASCII} character.
@item [:print:]
This matches printing characters---everything except @acronym{ASCII} control
characters and the delete character.
This matches printing characters---everything except @acronym{ASCII}
and non-@acronym{ASCII} control characters (including the delete
character), surrogates, and codepoints unassigned by Unicode, as
indicated by the Unicode @samp{general-category} property
(@pxref{Character Properties}).
@item [:punct:]
This matches any punctuation character. (At present, for multibyte
characters, it matches anything that has non-word syntax.)

View file

@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification support.
---
*** gulp.el
+++
** The character class [:print:] in regular expressions
no longer matches every multibyte character unconditionally. Instead, Emacs now
consults the Unicode character properties to determine which
characters are printable. In particular, surrogates and unassigned
codepoints are now rejected by this class. If you want the old
behavior, use [:multibyte:] instead.
* New Modes and Packages in Emacs 25.1

View file

@ -969,16 +969,16 @@ CHAR
space, and DEL.
`printing', `print'
matches printing characters--everything except ASCII control chars
and DEL.
matches printing characters--everything except ASCII and non-ASCII
control characters, surrogates, and codepoints unassigned by Unicode.
`alphanumeric', `alnum'
matches letters and digits. (But at present, for multibyte characters,
it matches anything that has word syntax.)
matches alphabetic characters and digits. (For multibyte characters,
it matches according to Unicode character properties.)
`letter', `alphabetic', `alpha'
matches letters. (But at present, for multibyte characters,
it matches anything that has word syntax.)
matches alphabetic characters. (For multibyte characters,
it matches according to Unicode character properties.)
`ascii'
matches ASCII (unibyte) characters.

View file

@ -1022,6 +1022,22 @@ decimalnump (int c)
return gen_cat == UNICODE_CATEGORY_Nd;
}
/* Tell whether the character C is printable, judging by the Unicode
   general category recorded for it in Vunicode_category_table.
   A character whose table entry is not an integer is reported as
   not printable.  */
bool
printablep (int c)
{
  Lisp_Object entry = CHAR_TABLE_REF (Vunicode_category_table, c);

  if (INTEGERP (entry))
    {
      EMACS_INT general_category = XINT (entry);

      /* See UTS #18: every category is printable except the three
         tested below.  */
      return (general_category != UNICODE_CATEGORY_Cc       /* control */
              && general_category != UNICODE_CATEGORY_Cs    /* surrogate */
              && general_category != UNICODE_CATEGORY_Cn);  /* unassigned */
    }

  return false;
}
void
syms_of_character (void)
{

View file

@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object);
extern bool alphabeticp (int);
extern bool decimalnump (int);
extern bool printablep (int);
/* Return a translation table of id number ID. */
#define GET_TRANSLATION_TABLE(id) \

View file

@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
: 1)
: printablep (c))
# define ISALNUM(c) (IS_REAL_ASCII (c) \
? (((c) >= 'a' && (c) <= 'z') \
@ -1865,7 +1865,8 @@ struct range_table_work_area
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
/* Bits used to implement the multibyte-part of the various character classes
such as [:alnum:] in a charset's range table. */
such as [:alnum:] in a charset's range table. The code currently assumes
that only the low 16 bits are used. */
#define BIT_WORD 0x1
#define BIT_LOWER 0x2
#define BIT_PUNCT 0x4
@ -1874,6 +1875,7 @@ struct range_table_work_area
#define BIT_MULTIBYTE 0x20
#define BIT_ALPHA 0x40
#define BIT_ALNUM 0x80
#define BIT_PRINT 0x100
/* Set the bit for character C in a list. */
@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc)
{
switch (cc)
{
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
case RECC_NONASCII: case RECC_GRAPH:
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
case RECC_ALPHA: return BIT_ALPHA;
case RECC_ALNUM: return BIT_ALNUM;
@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc)
case RECC_UPPER: return BIT_UPPER;
case RECC_PUNCT: return BIT_PUNCT;
case RECC_SPACE: return BIT_SPACE;
case RECC_PRINT: return BIT_PRINT;
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
default: