Make [:print:] support non-ASCII characters correctly
* src/regex.c (ISPRINT): Call 'printablep' for multibyte characters.
(BIT_PRINT): New bit mask.
(re_wctype_to_bit): Return BIT_PRINT for RECC_PRINT.
* src/character.c (printablep): New function.
* src/character.h (printablep): Add prototype.
* lisp/emacs-lisp/rx.el (rx): Doc fix: document the new behavior of
'print', 'alnum', and 'alphabetic'.
* doc/lispref/searching.texi (Char Classes): Document the new
behavior of [:print:].
* etc/NEWS: Mention the new behavior of [:print:].
This commit is contained in:
parent
8802474a21
commit
6c284c6b58
6 changed files with 42 additions and 11 deletions
|
@ -569,8 +569,11 @@ This matches any multibyte character (@pxref{Text Representations}).
|
|||
@item [:nonascii:]
|
||||
This matches any non-@acronym{ASCII} character.
|
||||
@item [:print:]
|
||||
This matches printing characters---everything except @acronym{ASCII} control
|
||||
characters and the delete character.
|
||||
This matches printing characters---everything except @acronym{ASCII}
|
||||
and non-@acronym{ASCII} control characters (including the delete
|
||||
character), surrogates, and codepoints unassigned by Unicode, as
|
||||
indicated by the Unicode @samp{general-category} property
|
||||
(@pxref{Character Properties}).
|
||||
@item [:punct:]
|
||||
This matches any punctuation character. (At present, for multibyte
|
||||
characters, it matches anything that has non-word syntax.)
|
||||
|
|
8
etc/NEWS
8
etc/NEWS
|
@ -628,6 +628,14 @@ notifications, if Emacs is compiled with file notification support.
|
|||
---
|
||||
*** gulp.el
|
||||
|
||||
+++
|
||||
** The character class [:print:] in regular expressions
|
||||
no longer matches every multibyte character. Instead, Emacs now
|
||||
consults the Unicode character properties to determine which
|
||||
characters are printable. In particular, surrogates and unassigned
|
||||
codepoints are now rejected by this class. If you want the old
|
||||
behavior, use [:multibyte:] instead.
|
||||
|
||||
|
||||
* New Modes and Packages in Emacs 25.1
|
||||
|
||||
|
|
|
@ -969,16 +969,16 @@ CHAR
|
|||
space, and DEL.
|
||||
|
||||
`printing', `print'
|
||||
matches printing characters--everything except ASCII control chars
|
||||
and DEL.
|
||||
matches printing characters--everything except ASCII and non-ASCII
|
||||
control characters, surrogates, and codepoints unassigned by Unicode.
|
||||
|
||||
`alphanumeric', `alnum'
|
||||
matches letters and digits. (But at present, for multibyte characters,
|
||||
it matches anything that has word syntax.)
|
||||
matches alphabetic characters and digits. (For multibyte characters,
|
||||
it matches according to Unicode character properties.)
|
||||
|
||||
`letter', `alphabetic', `alpha'
|
||||
matches letters. (But at present, for multibyte characters,
|
||||
it matches anything that has word syntax.)
|
||||
matches alphabetic characters. (For multibyte characters,
|
||||
it matches according to Unicode character properties.)
|
||||
|
||||
`ascii'
|
||||
matches ASCII (unibyte) characters.
|
||||
|
|
|
@ -1022,6 +1022,22 @@ decimalnump (int c)
|
|||
return gen_cat == UNICODE_CATEGORY_Nd;
|
||||
}
|
||||
|
||||
/* Return 'true' if C is a printable character as defined by its
|
||||
Unicode properties. */
|
||||
bool
|
||||
printablep (int c)
|
||||
{
|
||||
Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
|
||||
if (! INTEGERP (category))
|
||||
return false;
|
||||
EMACS_INT gen_cat = XINT (category);
|
||||
|
||||
/* See UTS #18. */
|
||||
return (!(gen_cat == UNICODE_CATEGORY_Cc /* control */
|
||||
|| gen_cat == UNICODE_CATEGORY_Cs /* surrogate */
|
||||
|| gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
|
||||
}
|
||||
|
||||
void
|
||||
syms_of_character (void)
|
||||
{
|
||||
|
|
|
@ -662,6 +662,7 @@ extern Lisp_Object string_escape_byte8 (Lisp_Object);
|
|||
|
||||
extern bool alphabeticp (int);
|
||||
extern bool decimalnump (int);
|
||||
extern bool printablep (int);
|
||||
|
||||
/* Return a translation table of id number ID. */
|
||||
#define GET_TRANSLATION_TABLE(id) \
|
||||
|
|
|
@ -318,7 +318,7 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
|
|||
|
||||
# define ISPRINT(c) (SINGLE_BYTE_CHAR_P (c) \
|
||||
? (c) >= ' ' && !((c) >= 0177 && (c) <= 0237) \
|
||||
: 1)
|
||||
: printablep (c))
|
||||
|
||||
# define ISALNUM(c) (IS_REAL_ASCII (c) \
|
||||
? (((c) >= 'a' && (c) <= 'z') \
|
||||
|
@ -1865,7 +1865,8 @@ struct range_table_work_area
|
|||
#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])
|
||||
|
||||
/* Bits used to implement the multibyte-part of the various character classes
|
||||
such as [:alnum:] in a charset's range table. */
|
||||
such as [:alnum:] in a charset's range table. The code currently assumes
|
||||
that only the low 16 bits are used. */
|
||||
#define BIT_WORD 0x1
|
||||
#define BIT_LOWER 0x2
|
||||
#define BIT_PUNCT 0x4
|
||||
|
@ -1874,6 +1875,7 @@ struct range_table_work_area
|
|||
#define BIT_MULTIBYTE 0x20
|
||||
#define BIT_ALPHA 0x40
|
||||
#define BIT_ALNUM 0x80
|
||||
#define BIT_PRINT 0x100
|
||||
|
||||
|
||||
/* Set the bit for character C in a list. */
|
||||
|
@ -2072,7 +2074,7 @@ re_wctype_to_bit (re_wctype_t cc)
|
|||
{
|
||||
switch (cc)
|
||||
{
|
||||
case RECC_NONASCII: case RECC_PRINT: case RECC_GRAPH:
|
||||
case RECC_NONASCII: case RECC_GRAPH:
|
||||
case RECC_MULTIBYTE: return BIT_MULTIBYTE;
|
||||
case RECC_ALPHA: return BIT_ALPHA;
|
||||
case RECC_ALNUM: return BIT_ALNUM;
|
||||
|
@ -2081,6 +2083,7 @@ re_wctype_to_bit (re_wctype_t cc)
|
|||
case RECC_UPPER: return BIT_UPPER;
|
||||
case RECC_PUNCT: return BIT_PUNCT;
|
||||
case RECC_SPACE: return BIT_SPACE;
|
||||
case RECC_PRINT: return BIT_PRINT;
|
||||
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
|
||||
case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
|
||||
default:
|
||||
|
|
Loading…
Add table
Reference in a new issue