Add support for Unicode whitespace in [:blank:]
See Bug#25366.
* src/character.c (blankp): New function for checking Unicode horizontal whitespace.
* src/regex.c (ISBLANK): Use 'blankp' for non-ASCII horizontal whitespace.
(BIT_BLANK): New bit for range table.
(re_wctype_to_bit, execute_charset): Use it.
* test/lisp/subr-tests.el (subr-tests--string-match-p--blank): Add unit test for [:blank:] character class.
* test/src/regex-tests.el (test): Adapt unit test.
* doc/lispref/searching.texi (Char Classes): Document new Unicode behavior for [:blank:].
This commit is contained in:
parent
8f0376309e
commit
512e9886be
7 changed files with 48 additions and 6 deletions
|
@ -553,7 +553,11 @@ characters whose Unicode @samp{general-category} property
|
|||
(@pxref{Character Properties}) indicates they are alphabetic
|
||||
characters.
|
||||
@item [:blank:]
|
||||
This matches space and tab only.
|
||||
This matches horizontal whitespace, as defined by Annex C of the
|
||||
Unicode Technical Standard #18. In particular, it matches spaces,
|
||||
tabs, and other characters whose Unicode @samp{general-category}
|
||||
property (@pxref{Character Properties}) indicates they are spacing
|
||||
separators.
|
||||
@item [:cntrl:]
|
||||
This matches any @acronym{ASCII} control character.
|
||||
@item [:digit:]
|
||||
|
|
6
etc/NEWS
6
etc/NEWS
|
@ -710,6 +710,12 @@ of curved quotes in format arguments to functions like 'message' and
|
|||
now generate less chatter and more-compact diagnostics. The auxiliary
|
||||
function 'check-declare-errmsg' has been removed.
|
||||
|
||||
+++
|
||||
** The regular expression character class [:blank:] now matches
|
||||
Unicode horizontal whitespace as defined in the Unicode Technical
|
||||
Standard #18. If you only want to match space and tab, use [ \t]
|
||||
instead.
|
||||
|
||||
|
||||
* Lisp Changes in Emacs 26.1
|
||||
|
||||
|
|
|
@ -1038,6 +1038,23 @@ printablep (int c)
|
|||
|| gen_cat == UNICODE_CATEGORY_Cn)); /* unassigned */
|
||||
}
|
||||
|
||||
/* Return true if C is a horizontal whitespace character, as defined
|
||||
by http://www.unicode.org/reports/tr18/tr18-19.html#blank.
That is: space, tab, or any character whose entry in
Vunicode_category_table is UNICODE_CATEGORY_Zs (space separator).
Return false for everything else, including characters whose table
entry is not an integer. */
|
||||
bool
|
||||
blankp (int c)
|
||||
{
|
||||
/* Fast path for ASCII characters that are always assumed to
|
||||
constitute horizontal whitespace. Space and tab are answered here
without consulting the category table.
NOTE(review): tab is presumably not in category Zs, so this check
would also be what makes a direct call with a tab return true --
confirm against the Unicode data. */
|
||||
if (c == ' ' || c == '\t')
|
||||
return true;
|
||||
|
||||
/* Every other character is classified by its Unicode general
category, looked up in the category char-table. */
Lisp_Object category = CHAR_TABLE_REF (Vunicode_category_table, c);
|
||||
/* No integer category recorded for C: treat it as non-blank. */
if (! INTEGERP (category))
|
||||
return false;
|
||||
|
||||
return XINT (category) == UNICODE_CATEGORY_Zs; /* separator, space */
|
||||
}
|
||||
|
||||
void
|
||||
syms_of_character (void)
|
||||
{
|
||||
|
|
|
@ -680,6 +680,7 @@ extern bool alphabeticp (int);
|
|||
extern bool alphanumericp (int);
|
||||
extern bool graphicp (int);
|
||||
extern bool printablep (int);
|
||||
extern bool blankp (int);
|
||||
|
||||
/* Return a translation table of id number ID. */
|
||||
#define GET_TRANSLATION_TABLE(id) \
|
||||
|
|
12
src/regex.c
12
src/regex.c
|
@ -310,11 +310,12 @@ enum syntaxcode { Swhitespace = 0, Sword = 1, Ssymbol = 2 };
|
|||
|| ((c) >= 'a' && (c) <= 'f') \
|
||||
|| ((c) >= 'A' && (c) <= 'F'))
|
||||
|
||||
/* This is only used for single-byte characters. */
|
||||
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
|
||||
|
||||
/* The rest must handle multibyte characters. */
|
||||
|
||||
/* True for space and tab when C satisfies IS_REAL_ASCII; any other
character is classified by blankp. C appears more than once in the
expansion, so the argument should be free of side effects. */
# define ISBLANK(c) (IS_REAL_ASCII (c) \
|
||||
? ((c) == ' ' || (c) == '\t') \
|
||||
: blankp (c))
|
||||
|
||||
# define ISGRAPH(c) (SINGLE_BYTE_CHAR_P (c) \
|
||||
? (c) > ' ' && !((c) >= 0177 && (c) <= 0240) \
|
||||
: graphicp (c))
|
||||
|
@ -1790,6 +1791,7 @@ struct range_table_work_area
|
|||
#define BIT_ALNUM 0x80
|
||||
#define BIT_GRAPH 0x100
|
||||
#define BIT_PRINT 0x200
|
||||
#define BIT_BLANK 0x400
|
||||
|
||||
|
||||
/* Set the bit for character C in a list. */
|
||||
|
@ -2066,8 +2068,9 @@ re_wctype_to_bit (re_wctype_t cc)
|
|||
case RECC_SPACE: return BIT_SPACE;
|
||||
case RECC_GRAPH: return BIT_GRAPH;
|
||||
case RECC_PRINT: return BIT_PRINT;
|
||||
case RECC_BLANK: return BIT_BLANK;
|
||||
case RECC_ASCII: case RECC_DIGIT: case RECC_XDIGIT: case RECC_CNTRL:
|
||||
case RECC_BLANK: case RECC_UNIBYTE: case RECC_ERROR: return 0;
|
||||
case RECC_UNIBYTE: case RECC_ERROR: return 0;
|
||||
default:
|
||||
abort ();
|
||||
}
|
||||
|
@ -4658,6 +4661,7 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
|
|||
(class_bits & BIT_ALNUM && ISALNUM (c)) ||
|
||||
(class_bits & BIT_ALPHA && ISALPHA (c)) ||
|
||||
(class_bits & BIT_SPACE && ISSPACE (c)) ||
|
||||
(class_bits & BIT_BLANK && ISBLANK (c)) ||
|
||||
(class_bits & BIT_WORD && ISWORD (c)) ||
|
||||
((class_bits & BIT_UPPER) &&
|
||||
(ISUPPER (c) || (corig != c &&
|
||||
|
|
|
@ -271,5 +271,15 @@ indirectly `mapbacktrace'."
|
|||
(let ((frame-lists (subr-test--frames-1 'subr-test--frames-2)))
|
||||
(should (equal (car frame-lists) (cdr frame-lists)))))
|
||||
|
||||
(ert-deftest subr-tests--string-match-p--blank ()
|
||||
"Test that [:blank:] matches horizontal whitespace, cf. Bug#25366."
|
||||
;; ASCII space and tab must match.
(should (equal (string-match-p "\\`[[:blank:]]\\'" " ") 0))
|
||||
(should (equal (string-match-p "\\`[[:blank:]]\\'" "\t") 0))
|
||||
;; Newline and an ordinary letter must not match.
(should-not (string-match-p "\\`[[:blank:]]\\'" "\n"))
|
||||
(should-not (string-match-p "\\`[[:blank:]]\\'" "a"))
|
||||
;; Non-ASCII spaces must match: HAIR SPACE and U+3000 (IDEOGRAPHIC SPACE).
(should (equal (string-match-p "\\`[[:blank:]]\\'" "\N{HAIR SPACE}") 0))
|
||||
(should (equal (string-match-p "\\`[[:blank:]]\\'" "\u3000") 0))
|
||||
;; LINE SEPARATOR must not match, since it is not horizontal whitespace.
(should-not (string-match-p "\\`[[:blank:]]\\'" "\N{LINE SEPARATOR}")))
|
||||
|
||||
(provide 'subr-tests)
|
||||
;;; subr-tests.el ends here
|
||||
|
|
|
@ -80,7 +80,7 @@ character) must match a string \"\u2420\"."
|
|||
("print" "abcłąka\u2620-, " "\t\n\1")
|
||||
|
||||
("space" " \t\n\u2001" "abcABCł0123")
|
||||
("blank" " \t" "\n\u2001")
|
||||
("blank" " \t\u2001" "\n")
|
||||
|
||||
("ascii" "abcABC012 \t\n\1" "łą\u2620")
|
||||
("nonascii" "łą\u2622" "abcABC012 \t\n\1")
|
||||
|
|
Loading…
Add table
Reference in a new issue