diff --git a/src/regex.c b/src/regex.c index 1f2a1f086de..3a25835f452 100644 --- a/src/regex.c +++ b/src/regex.c @@ -1969,29 +1969,96 @@ struct range_table_work_area #if ! WIDE_CHAR_SUPPORT -/* Map a string to the char class it names (if any). */ +/* Parse a character class, i.e. string such as "[:name:]". *strp + points to the string to be parsed and limit is length, in bytes, of + that string. + + If *strp point to a string that begins with "[:name:]", where name is + a non-empty sequence of lower case letters, *strp will be advanced past the + closing square bracket and RECC_* constant which maps to the name will be + returned. If name is not a valid character class name zero, or RECC_ERROR, + is returned. + + Otherwise, if *strp doesn’t begin with "[:name:]", -1 is returned. + + The function can be used on ASCII and multibyte (UTF-8-encoded) strings. + */ re_wctype_t -re_wctype (const_re_char *str) +re_wctype_parse (const unsigned char **strp, unsigned limit) { - const char *string = (const char *) str; - if (STREQ (string, "alnum")) return RECC_ALNUM; - else if (STREQ (string, "alpha")) return RECC_ALPHA; - else if (STREQ (string, "word")) return RECC_WORD; - else if (STREQ (string, "ascii")) return RECC_ASCII; - else if (STREQ (string, "nonascii")) return RECC_NONASCII; - else if (STREQ (string, "graph")) return RECC_GRAPH; - else if (STREQ (string, "lower")) return RECC_LOWER; - else if (STREQ (string, "print")) return RECC_PRINT; - else if (STREQ (string, "punct")) return RECC_PUNCT; - else if (STREQ (string, "space")) return RECC_SPACE; - else if (STREQ (string, "upper")) return RECC_UPPER; - else if (STREQ (string, "unibyte")) return RECC_UNIBYTE; - else if (STREQ (string, "multibyte")) return RECC_MULTIBYTE; - else if (STREQ (string, "digit")) return RECC_DIGIT; - else if (STREQ (string, "xdigit")) return RECC_XDIGIT; - else if (STREQ (string, "cntrl")) return RECC_CNTRL; - else if (STREQ (string, "blank")) return RECC_BLANK; - else return 0; + const char *beg = (const char *)*strp, *it; + + if (limit < 4 || beg[0] != '[' || beg[1] != ':') + return -1; + + beg += 2; /* skip opening ‘[:’ */ + limit -= 3; /* opening ‘[:’ and half of closing ‘:]’; --limit handles rest */ + for (it = beg; it[0] != ':' || it[1] != ']'; ++it) + if (!--limit) + return -1; + + *strp = (const unsigned char *)(it + 2); + + /* Sort tests in the length=five case by frequency the classes to minimise + number of times we fail the comparison. The frequencies of character class + names used in Emacs sources as of 2016-07-27: + + $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + | + sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr + 213 [:alnum:] + 104 [:alpha:] + 62 [:space:] + 39 [:digit:] + 36 [:blank:] + 26 [:word:] + 26 [:upper:] + 21 [:lower:] + 10 [:xdigit:] + 10 [:punct:] + 10 [:ascii:] + 4 [:nonascii:] + 4 [:graph:] + 2 [:print:] + 2 [:cntrl:] + 1 [:ff:] + + If you update this list, consider also updating chain of or’ed conditions + in execute_charset function. + */ + + switch (it - beg) { + case 4: + if (!memcmp (beg, "word", 4)) return RECC_WORD; + break; + case 5: + if (!memcmp (beg, "alnum", 5)) return RECC_ALNUM; + if (!memcmp (beg, "alpha", 5)) return RECC_ALPHA; + if (!memcmp (beg, "space", 5)) return RECC_SPACE; + if (!memcmp (beg, "digit", 5)) return RECC_DIGIT; + if (!memcmp (beg, "blank", 5)) return RECC_BLANK; + if (!memcmp (beg, "upper", 5)) return RECC_UPPER; + if (!memcmp (beg, "lower", 5)) return RECC_LOWER; + if (!memcmp (beg, "punct", 5)) return RECC_PUNCT; + if (!memcmp (beg, "ascii", 5)) return RECC_ASCII; + if (!memcmp (beg, "graph", 5)) return RECC_GRAPH; + if (!memcmp (beg, "print", 5)) return RECC_PRINT; + if (!memcmp (beg, "cntrl", 5)) return RECC_CNTRL; + break; + case 6: + if (!memcmp (beg, "xdigit", 6)) return RECC_XDIGIT; + break; + case 7: + if (!memcmp (beg, "unibyte", 7)) return RECC_UNIBYTE; + break; + case 8: + if (!memcmp (beg, "nonascii", 8)) return RECC_NONASCII; + break; + case 9: + if (!memcmp (beg, "multibyte", 9)) return RECC_MULTIBYTE; + break; + } + + return RECC_ERROR; } /* True if CH is in the char class CC. */ @@ -2776,10 +2843,74 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, { boolean escaped_char = false; const unsigned char *p2 = p; + re_wctype_t cc; re_wchar_t ch; if (p == pend) FREE_STACK_RETURN (REG_EBRACK); + /* See if we're at the beginning of a possible character + class. */ + if (syntax & RE_CHAR_CLASSES && + (cc = re_wctype_parse(&p, pend - p)) != -1) + { + if (cc == 0) + FREE_STACK_RETURN (REG_ECTYPE); + + if (p == pend) + FREE_STACK_RETURN (REG_EBRACK); + +#ifndef emacs + for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) + if (re_iswctype (btowc (ch), cc)) + { + c = TRANSLATE (ch); + if (c < (1 << BYTEWIDTH)) + SET_LIST_BIT (c); + } +#else /* emacs */ + /* Most character classes in a multibyte match just set + a flag. Exceptions are is_blank, is_digit, is_cntrl, and + is_xdigit, since they can only match ASCII characters. + We don't need to handle them for multibyte. They are + distinguished by a negative wctype. */ + + /* Setup the gl_state object to its buffer-defined value. + This hardcodes the buffer-global syntax-table for ASCII + chars, while the other chars will obey syntax-table + properties. It's not ideal, but it's the way it's been + done until now. */ + SETUP_BUFFER_SYNTAX_TABLE (); + + for (ch = 0; ch < 256; ++ch) + { + c = RE_CHAR_TO_MULTIBYTE (ch); + if (! CHAR_BYTE8_P (c) + && re_iswctype (c, cc)) + { + SET_LIST_BIT (ch); + c1 = TRANSLATE (c); + if (c1 == c) + continue; + if (ASCII_CHAR_P (c1)) + SET_LIST_BIT (c1); + else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) + SET_LIST_BIT (c1); + } + } + SET_RANGE_TABLE_WORK_AREA_BIT + (range_table_work, re_wctype_to_bit (cc)); +#endif /* emacs */ + /* In most cases the matching rule for char classes only + uses the syntax table for multibyte chars, so that the + content of the syntax-table is not hardcoded in the + range_table. SPACE and WORD are the two exceptions. */ + if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) + bufp->used_syntax = 1; + + /* Repeat the loop. */ + continue; + } + /* Don't translate yet. The range TRANSLATE(X..Y) cannot always be determined from TRANSLATE(X) and TRANSLATE(Y) So the translation is done later in a loop. Example: @@ -2803,119 +2934,6 @@ regex_compile (const_re_char *pattern, size_t size, reg_syntax_t syntax, break; } - /* See if we're at the beginning of a possible character - class. */ - - if (!escaped_char && - syntax & RE_CHAR_CLASSES && c == '[' && *p == ':') - { - /* Leave room for the null. */ - unsigned char str[CHAR_CLASS_MAX_LENGTH + 1]; - const unsigned char *class_beg; - - PATFETCH (c); - c1 = 0; - class_beg = p; - - /* If pattern is `[[:'. */ - if (p == pend) FREE_STACK_RETURN (REG_EBRACK); - - for (;;) - { - PATFETCH (c); - if ((c == ':' && *p == ']') || p == pend) - break; - if (c1 < CHAR_CLASS_MAX_LENGTH) - str[c1++] = c; - else - /* This is in any case an invalid class name. */ - str[0] = '\0'; - } - str[c1] = '\0'; - - /* If isn't a word bracketed by `[:' and `:]': - undo the ending character, the letters, and - leave the leading `:' and `[' (but set bits for - them). */ - if (c == ':' && *p == ']') - { - re_wctype_t cc = re_wctype (str); - - if (cc == 0) - FREE_STACK_RETURN (REG_ECTYPE); - - /* Throw away the ] at the end of the character - class. */ - PATFETCH (c); - - if (p == pend) FREE_STACK_RETURN (REG_EBRACK); - -#ifndef emacs - for (ch = 0; ch < (1 << BYTEWIDTH); ++ch) - if (re_iswctype (btowc (ch), cc)) - { - c = TRANSLATE (ch); - if (c < (1 << BYTEWIDTH)) - SET_LIST_BIT (c); - } -#else /* emacs */ - /* Most character classes in a multibyte match - just set a flag. Exceptions are is_blank, - is_digit, is_cntrl, and is_xdigit, since - they can only match ASCII characters. We - don't need to handle them for multibyte. - They are distinguished by a negative wctype. */ - - /* Setup the gl_state object to its buffer-defined - value. This hardcodes the buffer-global - syntax-table for ASCII chars, while the other chars - will obey syntax-table properties. It's not ideal, - but it's the way it's been done until now. */ - SETUP_BUFFER_SYNTAX_TABLE (); - - for (ch = 0; ch < 256; ++ch) - { - c = RE_CHAR_TO_MULTIBYTE (ch); - if (! CHAR_BYTE8_P (c) - && re_iswctype (c, cc)) - { - SET_LIST_BIT (ch); - c1 = TRANSLATE (c); - if (c1 == c) - continue; - if (ASCII_CHAR_P (c1)) - SET_LIST_BIT (c1); - else if ((c1 = RE_CHAR_TO_UNIBYTE (c1)) >= 0) - SET_LIST_BIT (c1); - } - } - SET_RANGE_TABLE_WORK_AREA_BIT - (range_table_work, re_wctype_to_bit (cc)); -#endif /* emacs */ - /* In most cases the matching rule for char classes - only uses the syntax table for multibyte chars, - so that the content of the syntax-table is not - hardcoded in the range_table. SPACE and WORD are - the two exceptions. */ - if ((1 << cc) & ((1 << RECC_SPACE) | (1 << RECC_WORD))) - bufp->used_syntax = 1; - - /* Repeat the loop. */ - continue; - } - else - { - /* Go back to right after the "[:". */ - p = class_beg; - SET_LIST_BIT ('['); - - /* Because the `:' may start the range, we - can't simply set bit and repeat the loop. - Instead, just set it to C and handle below. */ - c = ':'; - } - } - if (p < pend && p[0] == '-' && p[1] != ']') { @@ -4659,28 +4677,8 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte) re_wchar_t range_start, range_end; /* Sort tests by the most commonly used classes with some adjustment to which - tests are easiest to perform. Frequencies of character class names used in - Emacs sources as of 2016-07-15: - - $ find \( -name \*.c -o -name \*.el \) -exec grep -h '\[:[a-z]*:]' {} + | - sed 's/]/]\n/g' |grep -o '\[:[a-z]*:]' |sort |uniq -c |sort -nr - 213 [:alnum:] - 104 [:alpha:] - 62 [:space:] - 39 [:digit:] - 36 [:blank:] - 26 [:upper:] - 24 [:word:] - 21 [:lower:] - 10 [:punct:] - 10 [:ascii:] - 9 [:xdigit:] - 4 [:nonascii:] - 4 [:graph:] - 2 [:print:] - 2 [:cntrl:] - 1 [:ff:] - */ + tests are easiest to perform. Take a look at comment in re_wctype_parse + for table with frequencies of character class names. */ if ((class_bits & BIT_MULTIBYTE) || (class_bits & BIT_ALNUM && ISALNUM (c)) || diff --git a/src/regex.h b/src/regex.h index 817167a07ca..01b659addbb 100644 --- a/src/regex.h +++ b/src/regex.h @@ -585,25 +585,13 @@ extern void regfree (regex_t *__preg); /* Solaris 2.5 has a bug: must be included before . */ # include # include -#endif -#if WIDE_CHAR_SUPPORT -/* The GNU C library provides support for user-defined character classes - and the functions from ISO C amendment 1. */ -# ifdef CHARCLASS_NAME_MAX -# define CHAR_CLASS_MAX_LENGTH CHARCLASS_NAME_MAX -# else -/* This shouldn't happen but some implementation might still have this - problem. Use a reasonable default value. */ -# define CHAR_CLASS_MAX_LENGTH 256 -# endif typedef wctype_t re_wctype_t; typedef wchar_t re_wchar_t; # define re_wctype wctype # define re_iswctype iswctype # define re_wctype_to_bit(cc) 0 #else -# define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */ # ifndef emacs # define btowc(c) c # endif @@ -621,7 +609,7 @@ typedef enum { RECC_ERROR = 0, } re_wctype_t; extern char re_iswctype (int ch, re_wctype_t cc); -extern re_wctype_t re_wctype (const unsigned char* str); +extern re_wctype_t re_wctype_parse (const unsigned char **strp, unsigned limit); typedef int re_wchar_t; diff --git a/src/syntax.c b/src/syntax.c index f8d987b377c..667de402ec4 100644 --- a/src/syntax.c +++ b/src/syntax.c @@ -1691,44 +1691,22 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim, /* At first setup fastmap. */ while (i_byte < size_byte) { - c = str[i_byte++]; - - if (handle_iso_classes && c == '[' - && i_byte < size_byte - && str[i_byte] == ':') + if (handle_iso_classes) { - const unsigned char *class_beg = str + i_byte + 1; - const unsigned char *class_end = class_beg; - const unsigned char *class_limit = str + size_byte - 2; - /* Leave room for the null. */ - unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1]; - re_wctype_t cc; - - if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH) - class_limit = class_beg + CHAR_CLASS_MAX_LENGTH; - - while (class_end < class_limit - && *class_end >= 'a' && *class_end <= 'z') - class_end++; - - if (class_end == class_beg - || *class_end != ':' || class_end[1] != ']') - goto not_a_class_name; - - memcpy (class_name, class_beg, class_end - class_beg); - class_name[class_end - class_beg] = 0; - - cc = re_wctype (class_name); + const unsigned char *ch = str + i_byte; + re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte); if (cc == 0) error ("Invalid ISO C character class"); - - iso_classes = Fcons (make_number (cc), iso_classes); - - i_byte = class_end + 2 - str; - continue; + if (cc != -1) + { + iso_classes = Fcons (make_number (cc), iso_classes); + i_byte = ch - str; + continue; + } } - not_a_class_name: + c = str[i_byte++]; + if (c == '\\') { if (i_byte == size_byte) @@ -1808,54 +1786,32 @@ skip_chars (bool forwardp, Lisp_Object string, Lisp_Object lim, while (i_byte < size_byte) { int leading_code = str[i_byte]; - c = STRING_CHAR_AND_LENGTH (str + i_byte, len); - i_byte += len; - if (handle_iso_classes && c == '[' - && i_byte < size_byte - && STRING_CHAR (str + i_byte) == ':') + if (handle_iso_classes) { - const unsigned char *class_beg = str + i_byte + 1; - const unsigned char *class_end = class_beg; - const unsigned char *class_limit = str + size_byte - 2; - /* Leave room for the null. */ - unsigned char class_name[CHAR_CLASS_MAX_LENGTH + 1]; - re_wctype_t cc; - - if (class_limit - class_beg > CHAR_CLASS_MAX_LENGTH) - class_limit = class_beg + CHAR_CLASS_MAX_LENGTH; - - while (class_end < class_limit - && *class_end >= 'a' && *class_end <= 'z') - class_end++; - - if (class_end == class_beg - || *class_end != ':' || class_end[1] != ']') - goto not_a_class_name_multibyte; - - memcpy (class_name, class_beg, class_end - class_beg); - class_name[class_end - class_beg] = 0; - - cc = re_wctype (class_name); + const unsigned char *ch = str + i_byte; + re_wctype_t cc = re_wctype_parse (&ch, size_byte - i_byte); if (cc == 0) error ("Invalid ISO C character class"); - - iso_classes = Fcons (make_number (cc), iso_classes); - - i_byte = class_end + 2 - str; - continue; + if (cc != -1) + { + iso_classes = Fcons (make_number (cc), iso_classes); + i_byte = ch - str; + continue; + } } - not_a_class_name_multibyte: - if (c == '\\') + if (leading_code== '\\') { - if (i_byte == size_byte) + if (++i_byte == size_byte) break; leading_code = str[i_byte]; - c = STRING_CHAR_AND_LENGTH (str + i_byte, len); - i_byte += len; } + c = STRING_CHAR_AND_LENGTH (str + i_byte, len); + i_byte += len; + + /* Treat `-' as range character only if another character follows. */ if (i_byte + 1 < size_byte