Fix regexp character class syntax property ghost matching bug

The syntax-table-dependent regexp character classes [:space:],
[:word:] and [:punct:] always use the buffer-local syntax table for
performance reasons.  Fix a bug where these classes could produce ghost
(mis)matches by picking up state left behind by constructs that do use
syntax properties, such as `\sX`.

* src/regex-emacs.c (BUFFER_SYNTAX): New macro.
(ISPUNCT, ISSPACE, ISWORD): Use BUFFER_SYNTAX instead of SYNTAX.
(regex_compile): Delete syntax table setup code that is no longer
needed.
* test/src/regex-emacs-tests.el (regex-emacs-syntax-properties):
New regression test.
This commit is contained in:
Mattias Engdegård 2023-07-22 17:26:11 +02:00
parent cfdce1a19f
commit 5d2d28458d
2 changed files with 28 additions and 12 deletions

View file

@ -47,6 +47,9 @@
/* Make syntax table lookups use the data in gl_state. */
#define SYNTAX(c) syntax_property (c, 1)
/* Explicit syntax lookup using the buffer-local table. */
#define BUFFER_SYNTAX(c) syntax_property (c, 0)
#define RE_MULTIBYTE_P(bufp) ((bufp)->multibyte)
#define RE_TARGET_MULTIBYTE_P(bufp) ((bufp)->target_multibyte)
#define RE_STRING_CHAR(p, multibyte) \
@ -132,18 +135,22 @@
#define ISLOWER(c) lowercasep (c)
#define ISUPPER(c) uppercasep (c)
/* The following predicates use the buffer-local syntax table and
ignore syntax properties, for consistency with the up-front
assumptions made at compile time. */
#define ISPUNCT(c) (IS_REAL_ASCII (c) \
? ((c) > ' ' && (c) < 0177 \
&& !(((c) >= 'a' && (c) <= 'z') \
|| ((c) >= 'A' && (c) <= 'Z') \
|| ((c) >= '0' && (c) <= '9'))) \
: SYNTAX (c) != Sword)
: BUFFER_SYNTAX (c) != Sword)
#define ISSPACE(c) (SYNTAX (c) == Swhitespace)
#define ISSPACE(c) (BUFFER_SYNTAX (c) == Swhitespace)
#define ISUPPER(c) uppercasep (c)
#define ISWORD(c) (SYNTAX (c) == Sword)
#define ISWORD(c) (BUFFER_SYNTAX (c) == Sword)
/* Use alloca instead of malloc. This is because using malloc in
re_search* or re_match* could cause memory leaks when C-g is used
@ -2048,13 +2055,6 @@ regex_compile (re_char *pattern, ptrdiff_t size,
is_xdigit, since they can only match ASCII characters.
We don't need to handle them for multibyte. */
/* Setup the gl_state object to its buffer-defined value.
This hardcodes the buffer-global syntax-table for ASCII
chars, while the other chars will obey syntax-table
properties. It's not ideal, but it's the way it's been
done until now. */
SETUP_BUFFER_SYNTAX_TABLE ();
for (c = 0; c < 0x80; ++c)
if (re_iswctype (c, cc))
{

View file

@ -949,4 +949,20 @@ This evaluates the TESTS test cases from glibc."
(should (equal (smatch "a\\=*b" "ab") 0))
))
(ert-deftest regex-emacs-syntax-properties ()
;; Verify absence of character class syntax property ghost matching bug.
;;
;; RE asks for a character with whitespace syntax (\s-, which honours
;; `syntax-table' text properties) immediately followed by a character
;; in the [:space:] class (which should consult only the buffer's
;; syntax table).
(let ((re "\\s-[[:space:]]")
;; S is "abéz"; only the "b" carries a `syntax-table' property.
;; NOTE(review): "é" is presumably chosen because it is non-ASCII and
;; therefore classified at match time rather than precomputed when
;; the pattern is compiled -- confirm against regex_compile.
(s (concat "a"
(propertize "b" 'syntax-table '(0)) ; whitespace
"éz"))
;; Bound non-nil so that `syntax-table' text properties are looked up
;; during matching; without it \s- could not match the "b".
(parse-sexp-lookup-properties t))
;; In both cases below the expected result is nil: \s- may match the
;; propertized "b", but the following "é" must not be treated as
;; [:space:] merely because the preceding character's syntax property
;; was just consulted.
;; Test matching in a string...
(should (equal (string-match re s) nil))
;; ... and in a buffer.
(should (equal (with-temp-buffer
(insert s)
(goto-char (point-min))
(re-search-forward re nil t))
nil))))
;;; regex-emacs-tests.el ends here