Fix range handling so it works for multibyte buffer (bug#73204)

Here by multibyte buffer I mean buffer that includes non-ASCII
characters.

The problem is illustrated by this comment, which I copied from the
source:

======================================================================
(ref:bytepos-range-pitfall) Suppose we have the following buffer
content ([ ] is a unibyte char, [    ] is a multibyte char):

    [a][b][c][d][e][ f  ]

and the following ranges (denoted by braces):

    [a][b][c][d][e][ f  ]
    {       }{    }

So far so good, now user deletes a unibyte char at the beginning:

    [b][c][d][e][ f  ]
    {       }{    }

Oops, now our range cuts into the multibyte char, bad!
======================================================================

* src/treesit.c (treesit_debug_print_parser_list): Minor fix.
(treesit_sync_visible_region): Change the way we fixup ranges, instead
of using the bytepos ranges from tree-sitter, we use the cached lisp
charpos ranges.
(treesit_make_ts_ranges): New function.
(Ftreesit_parser_set_included_ranges): Refactor out the new function
treesit_make_ts_ranges.
(Ftreesit_parser_included_ranges): Rather than getting the ranges from
tree-sitter, just return the cached lisp ranges.

* src/treesit.h (Lisp_TS_Parser): Add some comment.
* test/src/treesit-tests.el (treesit-range-fixup-after-edit): New test.
This commit is contained in:
Yuan Fu 2024-09-13 21:42:17 -07:00
parent 76faf7e609
commit 6a6d7925c9
No known key found for this signature in database
GPG key ID: 56E19BC57664A442
3 changed files with 173 additions and 90 deletions

View file

@ -499,7 +499,7 @@ treesit_debug_print_parser_list (char *msg, Lisp_Object parser)
SSDATA (SYMBOL_NAME (Vthis_command)),
SSDATA (SYMBOL_NAME (XTS_PARSER (parser)->language_symbol)),
buf_name, BUF_BEG (buf),
BUF_BEGV (buf), BUF_Z (buf), BUF_ZV (buf));
BUF_BEGV (buf), BUF_ZV (buf), BUF_Z (buf));
Lisp_Object tail = BVAR (buf, ts_parser_list);
FOR_EACH_TAIL (tail)
@ -922,6 +922,9 @@ treesit_record_change (ptrdiff_t start_byte, ptrdiff_t old_end_byte,
}
}
static TSRange *treesit_make_ts_ranges (Lisp_Object, Lisp_Object,
uint32_t *);
/* Comment (ref:visible-beg-null) The purpose of visible_beg/end is to
keep track of "which part of the buffer does the tree-sitter tree
see", in order to update the tree correctly. Visible_beg/end have
@ -1050,48 +1053,85 @@ treesit_sync_visible_region (Lisp_Object parser)
XTS_PARSER (parser)->visible_end = visible_end;
/* Fix ranges so that the ranges stays with in visible_end. Here we
try to do minimal work so that the ranges is minimally correct such
that there's no OOB error. Usually treesit-update-ranges should
update the parser with actually correct ranges. */
if (NILP (XTS_PARSER (parser)->last_set_ranges)) return;
uint32_t len;
const TSRange *ranges
= ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len);
/* We might need to discard some ranges that exceeds visible_end, in
that case, new_len is the length of the new ranges array (which
will be shorter than len). */
uint32_t new_len = 0;
uint32_t new_end = 0;
for (int idx = 0; idx < len; idx++)
{
TSRange range = ranges[idx];
/* If this range starts after visible_end, we don't include this
range and the ranges after it in the new ranges. */
if (range.start_byte + visible_beg >= visible_end)
try to do minimal work so that the ranges is minimally correct and
there's no OOB error. Usually treesit-update-ranges should update
the parser with semantically correct ranges.
We start with the charpos ranges, because for bytepos ranges, after
user edits, the ranges start/end might end up inside a multibyte
char! See (ref:bytepos-range-pitfall) below. */
Lisp_Object lisp_ranges = XTS_PARSER (parser)->last_set_ranges;
if (NILP (lisp_ranges)) return;
Lisp_Object new_ranges_head = lisp_ranges;
FOR_EACH_TAIL_SAFE (lisp_ranges)
{
Lisp_Object range = XCAR (lisp_ranges);
ptrdiff_t beg = XFIXNUM (XCAR (range));
ptrdiff_t end = XFIXNUM (XCDR (range));
if (end <= visible_beg)
/* Even the end is before visible_beg, discard this range. */
new_ranges_head = XCDR (new_ranges_head);
else if (beg >= visible_end)
{
/* Even the beg is after visible_end, dicard this range and all
the ranges after it. */
XSETCDR (range, Qnil);
break;
/* If this range's end is after visible_end, we don't include any
ranges after it, and changes the end of this range to
visible_end. */
if (range.end_byte + visible_beg > visible_end)
{
new_end = visible_end - visible_beg;
new_len++;
break;
}
new_len++;
}
if (new_len != len || new_end != 0)
}
else
{
/* At this point, the range overlaps with the visible portion of
the buffer in some way (in front / in back / completely
encased / completely encases). */
if (beg < visible_beg)
XSETCAR (range, make_fixnum (visible_beg));
if (end > visible_end)
XSETCDR (range, make_fixnum (visible_end));
}
}
XTS_PARSER (parser)->last_set_ranges = new_ranges_head;
if (NILP (new_ranges_head))
{
TSRange *new_ranges = xmalloc (sizeof (TSRange) * new_len);
memcpy (new_ranges, ranges, sizeof (TSRange) * new_len);
new_ranges[new_len - 1].end_byte = new_end;
/* TODO: What should we do if this fails? */
ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
new_ranges, new_len);
xfree (new_ranges);
bool success;
success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
NULL, 0);
eassert (success);
}
else
{
uint32_t len = 0;
TSRange *ts_ranges = treesit_make_ts_ranges (new_ranges_head, parser,
&len);
bool success;
success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
ts_ranges, len);
xfree (ts_ranges);
eassert (success);
}
}
/* (ref:bytepos-range-pitfall) Suppose we have the following buffer
content ([ ] is a unibyte char, [ ] is a multibyte char):
[a][b][c][d][e][ f ]
and the following ranges (denoted by braces):
[a][b][c][d][e][ f ]
{ }{ }
So far so good, now user deletes a unibyte char at the beginning:
[b][c][d][e][ f ]
{ }{ }
Oops, now our range cuts into the multibyte char, bad! */
static void
treesit_check_buffer_size (struct buffer *buffer)
{
@ -1228,7 +1268,12 @@ treesit_read_buffer (void *parser, uint32_t byte_index,
beg = NULL;
len = 0;
}
/* Normal case, read a character. */
/* Normal case, read a character. We can't give tree-sitter the
whole buffer range because we move the gap around, realloc the
buffer, etc; and there's no way to invalidate the previously
given range in tree-sitter. Move over, benchmark shows there's
very little difference between passing a whole chunk vs passing a
single char at once. The only cost is funcall I guess. */
else
{
beg = (char *) BUF_BYTE_ADDRESS (buffer, byte_pos);
@ -1739,6 +1784,48 @@ treesit_make_ranges (const TSRange *ranges, uint32_t len,
return Fnreverse (list);
}
/* Convert lisp ranges to tree-sitter ranges. Set LEN to the length of
the ranges. RANGES must be a valid ranges list, (cons of numbers, no
overlap, etc). PARSER must be a parser. This function doesn't check
for types. Caller must free the returned ranges. */
static TSRange *
treesit_make_ts_ranges (Lisp_Object ranges, Lisp_Object parser, uint32_t *len)
{
ptrdiff_t ranges_len = list_length (ranges);
if (ranges_len > UINT32_MAX)
xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges)));
*len = (uint32_t) ranges_len;
TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * ranges_len);
struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer);
for (int idx = 0; idx < ranges_len; idx++, ranges = XCDR (ranges))
{
Lisp_Object range = XCAR (ranges);
ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer,
XFIXNUM (XCAR (range)));
ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer,
XFIXNUM (XCDR (range)));
/* Shouldn't violate assertion since we just checked for
buffer size at the beginning of this function. */
eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
/* We don't care about points, put in dummy values. */
TSRange rg =
{
{0, 0}, {0, 0},
(uint32_t) beg_byte - XTS_PARSER (parser)->visible_beg,
(uint32_t) end_byte - XTS_PARSER (parser)->visible_beg
};
treesit_ranges[idx] = rg;
}
return treesit_ranges;
}
DEFUN ("treesit-parser-set-included-ranges",
Ftreesit_parser_set_included_ranges,
Streesit_parser_set_included_ranges,
@ -1778,33 +1865,8 @@ buffer. */)
}
else
{
/* Set ranges for PARSER. */
if (list_length (ranges) > UINT32_MAX)
xsignal (Qargs_out_of_range, list2 (ranges, Flength (ranges)));
uint32_t len = (uint32_t) list_length (ranges);
TSRange *treesit_ranges = xmalloc (sizeof (TSRange) * len);
struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer);
/* We can use XFIXNUM, XCAR, XCDR freely because we have checked
the input by treesit_check_range_argument. */
for (int idx = 0; !NILP (ranges); idx++, ranges = XCDR (ranges))
{
Lisp_Object range = XCAR (ranges);
ptrdiff_t beg_byte = buf_charpos_to_bytepos (buffer,
XFIXNUM (XCAR (range)));
ptrdiff_t end_byte = buf_charpos_to_bytepos (buffer,
XFIXNUM (XCDR (range)));
/* Shouldn't violate assertion since we just checked for
buffer size at the beginning of this function. */
eassert (beg_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
eassert (end_byte - BUF_BEGV_BYTE (buffer) <= UINT32_MAX);
/* We don't care about start and end points, put in dummy
values. */
TSRange rg = {{0, 0}, {0, 0},
(uint32_t) beg_byte - BUF_BEGV_BYTE (buffer),
(uint32_t) end_byte - BUF_BEGV_BYTE (buffer)};
treesit_ranges[idx] = rg;
}
uint32_t len = 0;
TSRange *treesit_ranges = treesit_make_ts_ranges (ranges, parser, &len);
success = ts_parser_set_included_ranges (XTS_PARSER (parser)->parser,
treesit_ranges, len);
xfree (treesit_ranges);
@ -1831,26 +1893,9 @@ See also `treesit-parser-set-included-ranges'. */)
treesit_check_parser (parser);
treesit_initialize ();
/* Our return value depends on the buffer state (BUF_BEGV_BYTE,
etc), so we need to sync up. */
treesit_check_buffer_size (XBUFFER (XTS_PARSER (parser)->buffer));
treesit_sync_visible_region (parser);
/* When the parser doesn't have a range set and we call
ts_parser_included_ranges on it, it doesn't return an empty list,
but rather return DEFAULT_RANGE. (A single range where start_byte
= 0, end_byte = UINT32_MAX). So we need to track whether the
parser is ranged ourselves. */
if (NILP (XTS_PARSER (parser)->last_set_ranges))
return Qnil;
uint32_t len;
const TSRange *ranges
= ts_parser_included_ranges (XTS_PARSER (parser)->parser, &len);
struct buffer *buffer = XBUFFER (XTS_PARSER (parser)->buffer);
return treesit_make_ranges (ranges, len, parser, buffer);
return XTS_PARSER (parser)->last_set_ranges;
}
DEFUN ("treesit-parser-notifiers", Ftreesit_parser_notifiers,

View file

@ -45,12 +45,23 @@ struct Lisp_TS_Parser
same tag. A tag is primarily used to differentiate between
parsers for the same language. */
Lisp_Object tag;
/* The Lisp ranges last set. This is use to compare to the new ranges
the users wants to set, and avoid reparse if the new ranges is the
same as the last set one. This might go out of sync with the
ranges we return from Ftreesit_parser_included_ranges, if we did a
ranges fix in treesit_sync_visible_region, but I don't think
that'll cause any harm. */
/* The Lisp ranges last set. One purpose for it is to compare to the
new ranges the users wants to set, and avoid reparse if the new
ranges is the same as the current one. Another purpose is to store
the ranges in charpos (ts api returns ranges in bytepos). We need
to use charpos so we don't end up having a range cut into a
multibyte character. (See (ref:bytepos-range-pitfall) in treesit.c
for more detail.)
treesit-parser-set-included-ranges sets this field;
treesit-parser-included-ranges directly returns this field, and
before each reparse, treesit_sync_visible_region uses this to
calculate a range for the parser that fits in the visible region.
Trivia: when the parser doesn't have a range set and we call
ts_parser_included_ranges on it, it doesn't return an empty list,
but rather return DEFAULT_RANGE. (A single range where start_byte
= 0, end_byte = UINT32_MAX). */
Lisp_Object last_set_ranges;
/* The buffer associated with this parser. */
Lisp_Object buffer;

View file

@ -684,6 +684,33 @@ visible_end.)"
(should (equal '((16 . 28)) (treesit-query-range
'javascript query nil nil '(1 . -1)))))))
(ert-deftest treesit-range-fixup-after-edit ()
"Tests if Emacs can fix OOB ranges after deleting text or narrowing."
(skip-unless (treesit-language-available-p 'json))
(with-temp-buffer
(let ((parser (treesit-parser-create 'json)))
(insert "11111111111111111111")
(treesit-parser-set-included-ranges parser '((1 . 20)))
(treesit-parser-root-node parser)
(should (equal (treesit-parser-included-ranges parser)
'((1 . 20))))
(narrow-to-region 5 15)
(should (equal (treesit-parser-included-ranges parser)
'((5 . 15))))
(widen)
;; Trickier ranges
;; 11111111111111111111
;; [ ] [ ]
;; { narrow }
(treesit-parser-set-included-ranges parser '((1 . 7) (10 . 15)))
(should (equal (treesit-parser-included-ranges parser)
'((1 . 7) (10 . 15))))
(narrow-to-region 5 13)
(should (equal (treesit-parser-included-ranges parser)
'((5 . 7) (10 . 13)))))))
;;; Multiple language
(ert-deftest treesit-multi-lang ()