Fix [:upper:] and [:lower:] for Unicode characters (bug#11309)
* src/regex-emacs.c (execute_charset): Add canon_table argument to
allow expression of a correct predicate for [:upper:] and [:lower:].
(mutually_exclusive_p, re_match_2_internal): Pass extra argument.
* test/src/regex-emacs-tests.el (regexp-case-fold, regexp-eszett):
New tests.  Parts of regexp-eszett still fail and are commented out.
This commit is contained in:
parent
22caab8bac
commit
be4d6b043f
2 changed files with 66 additions and 8 deletions
|
@ -3575,9 +3575,11 @@ skip_noops (re_char *p, re_char *pend)
|
|||
opcode. When the function finishes, *PP will be advanced past that opcode.
|
||||
C is character to test (possibly after translations) and CORIG is original
|
||||
character (i.e. without any translations). UNIBYTE denotes whether c is
|
||||
unibyte or multibyte character. */
|
||||
unibyte or multibyte character.
|
||||
CANON_TABLE is the canonicalisation table for case folding or Qnil. */
|
||||
static bool
|
||||
execute_charset (re_char **pp, int c, int corig, bool unibyte)
|
||||
execute_charset (re_char **pp, int c, int corig, bool unibyte,
|
||||
Lisp_Object canon_table)
|
||||
{
|
||||
eassume (0 <= c && 0 <= corig);
|
||||
re_char *p = *pp, *rtp = NULL;
|
||||
|
@ -3617,11 +3619,9 @@ execute_charset (re_char **pp, int c, int corig, bool unibyte)
|
|||
(class_bits & BIT_BLANK && ISBLANK (c)) ||
|
||||
(class_bits & BIT_WORD && ISWORD (c)) ||
|
||||
((class_bits & BIT_UPPER) &&
|
||||
(ISUPPER (c) || (corig != c &&
|
||||
c == downcase (corig) && ISLOWER (c)))) ||
|
||||
(ISUPPER (corig) || (!NILP (canon_table) && ISLOWER (corig)))) ||
|
||||
((class_bits & BIT_LOWER) &&
|
||||
(ISLOWER (c) || (corig != c &&
|
||||
c == upcase (corig) && ISUPPER(c)))) ||
|
||||
(ISLOWER (corig) || (!NILP (canon_table) && ISUPPER (corig)))) ||
|
||||
(class_bits & BIT_PUNCT && ISPUNCT (c)) ||
|
||||
(class_bits & BIT_GRAPH && ISGRAPH (c)) ||
|
||||
(class_bits & BIT_PRINT && ISPRINT (c)))
|
||||
|
@ -3696,7 +3696,8 @@ mutually_exclusive_p (struct re_pattern_buffer *bufp, re_char *p1,
|
|||
else if ((re_opcode_t) *p1 == charset
|
||||
|| (re_opcode_t) *p1 == charset_not)
|
||||
{
|
||||
if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c)))
|
||||
if (!execute_charset (&p1, c, c, !multibyte || ASCII_CHAR_P (c),
|
||||
Qnil))
|
||||
{
|
||||
DEBUG_PRINT (" No match => fast loop.\n");
|
||||
return true;
|
||||
|
@ -4367,7 +4368,7 @@ re_match_2_internal (struct re_pattern_buffer *bufp,
|
|||
}
|
||||
|
||||
p -= 1;
|
||||
if (!execute_charset (&p, c, corig, unibyte_char))
|
||||
if (!execute_charset (&p, c, corig, unibyte_char, translate))
|
||||
goto fail;
|
||||
|
||||
d += len;
|
||||
|
|
|
@ -803,4 +803,61 @@ This evaluates the TESTS test cases from glibc."
|
|||
(should-not (string-match "å" "\xe5"))
|
||||
(should-not (string-match "[å]" "\xe5")))
|
||||
|
||||
(ert-deftest regexp-case-fold ()
|
||||
"Test case-sensitive and case-insensitive matching."
|
||||
;; Case folding off: a literal pattern is expected to match only text
;; with exactly the same case, so the first hit is the last "xX" pair.
(let ((case-fold-search nil))
|
||||
(should (equal (string-match "aB" "ABaB") 2))
|
||||
(should (equal (string-match "åÄ" "ÅäåäÅÄåÄ") 6))
|
||||
(should (equal (string-match "λΛ" "lΛλλΛ") 3))
|
||||
(should (equal (string-match "шШ" "zШшшШ") 3))
|
||||
;; [:alpha:] skips ".3"; the run is expected to extend through 中
;; (index 11) and stop before ﷽, hence match-end 12.
(should (equal (string-match "[[:alpha:]]+" ".3aBåÄßλΛшШ中﷽") 2))
|
||||
(should (equal (match-end 0) 12))
|
||||
;; [:alnum:] additionally accepts the digit, so the run starts at 1.
(should (equal (string-match "[[:alnum:]]+" ".3aBåÄßλΛшШ中﷽") 1))
|
||||
(should (equal (match-end 0) 12))
|
||||
;; Without case folding, [:upper:] is expected to match only the
;; upper-case run (indices 6..9), not the preceding lower-case letters.
(should (equal (string-match "[[:upper:]]+" ".3aåλшBÄΛШ中﷽") 6))
|
||||
(should (equal (match-end 0) 10))
|
||||
;; Likewise [:lower:] is expected to match only the lower-case run.
(should (equal (string-match "[[:lower:]]+" ".3BÄΛШaåλш中﷽") 6))
|
||||
(should (equal (match-end 0) 10)))
|
||||
;; Case folding on: literal patterns are expected to match regardless
;; of case, so the first hit is the first pair of the right letters.
(let ((case-fold-search t))
|
||||
(should (equal (string-match "aB" "ABaB") 0))
|
||||
(should (equal (string-match "åÄ" "ÅäåäÅÄåÄ") 0))
|
||||
(should (equal (string-match "λΛ" "lΛλλΛ") 1))
|
||||
(should (equal (string-match "шШ" "zШшшШ") 1))
|
||||
;; [:alpha:] and [:alnum:] expectations are the same as with case
;; folding off.
(should (equal (string-match "[[:alpha:]]+" ".3aBåÄßλΛшШ中﷽") 2))
|
||||
(should (equal (match-end 0) 12))
|
||||
(should (equal (string-match "[[:alnum:]]+" ".3aBåÄßλΛшШ中﷽") 1))
|
||||
(should (equal (match-end 0) 12))
|
||||
;; With case folding, [:upper:] and [:lower:] are each expected to
;; match letters of either case (indices 2..9), but still not 中.
(should (equal (string-match "[[:upper:]]+" ".3aåλшBÄΛШ中﷽") 2))
|
||||
(should (equal (match-end 0) 10))
|
||||
(should (equal (string-match "[[:lower:]]+" ".3BÄΛШaåλш中﷽") 2))
|
||||
(should (equal (match-end 0) 10))))
|
||||
|
||||
(ert-deftest regexp-eszett ()
|
||||
"Test matching of ß and ẞ."
|
||||
;; ß is a lower-case letter (Ll); ẞ is an upper-case letter (Lu).
|
||||
;; Case folding off: each character is expected to match only itself,
;; and each case class only the character of that case.
(let ((case-fold-search nil))
|
||||
(should (equal (string-match "ß" "ß") 0))
|
||||
(should (equal (string-match "ß" "ẞ") nil))
|
||||
(should (equal (string-match "ẞ" "ß") nil))
|
||||
(should (equal (string-match "ẞ" "ẞ") 0))
|
||||
(should (equal (string-match "[[:alpha:]]" "ß") 0))
|
||||
;; FIXME: the next two assertions state the intended behaviour for ß
;; but still fail, so they are disabled (bug#11309).
|
||||
;;(should (equal (string-match "[[:lower:]]" "ß") 0))
|
||||
;;(should (equal (string-match "[[:upper:]]" "ß") nil))
|
||||
(should (equal (string-match "[[:alpha:]]" "ẞ") 0))
|
||||
(should (equal (string-match "[[:lower:]]" "ẞ") nil))
|
||||
(should (equal (string-match "[[:upper:]]" "ẞ") 0)))
|
||||
;; Case folding on: ß and ẞ are expected to match each other, and both
;; case classes are expected to match either character.
(let ((case-fold-search t))
|
||||
(should (equal (string-match "ß" "ß") 0))
|
||||
(should (equal (string-match "ß" "ẞ") 0))
|
||||
(should (equal (string-match "ẞ" "ß") 0))
|
||||
(should (equal (string-match "ẞ" "ẞ") 0))
|
||||
(should (equal (string-match "[[:alpha:]]" "ß") 0))
|
||||
;; FIXME: the next two assertions state the intended behaviour for ß
;; but still fail, so they are disabled (bug#11309).
|
||||
;;(should (equal (string-match "[[:lower:]]" "ß") 0))
|
||||
;;(should (equal (string-match "[[:upper:]]" "ß") 0))
|
||||
(should (equal (string-match "[[:alpha:]]" "ẞ") 0))
|
||||
(should (equal (string-match "[[:lower:]]" "ẞ") 0))
|
||||
(should (equal (string-match "[[:upper:]]" "ẞ") 0))))
|
||||
|
||||
;;; regex-emacs-tests.el ends here
|
||||
|
|
Loading…
Add table
Reference in a new issue