edit/src/ucd.rs
2025-03-19 03:13:50 +01:00

705 lines
22 KiB
Rust
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use crate::helpers::{CoordType, Point};
use crate::memchr::{memchr2, memrchr2};
use crate::ucd_gen::*;
use crate::utf8::Utf8Chars;
use std::cmp::Ordering;
/// Read access to a text buffer that is handed out in contiguous chunks of bytes.
pub trait Document {
    /// Returns a chunk of bytes that ends at byte offset `off`.
    fn read_backward(&self, off: usize) -> &[u8];
    /// Returns a chunk of bytes that starts at byte offset `off`.
    fn read_forward(&self, off: usize) -> &[u8];
}

/// A plain byte slice is a document consisting of one single chunk.
/// Offsets past the end are clamped to the slice length.
impl Document for &[u8] {
    fn read_backward(&self, off: usize) -> &[u8] {
        let (head, _) = self.split_at(off.min(self.len()));
        head
    }

    fn read_forward(&self, off: usize) -> &[u8] {
        let (_, tail) = self.split_at(off.min(self.len()));
        tail
    }
}
/// A position inside a document, tracked in several coordinate systems at once:
/// byte offset, logical line/cluster, visual row/column and unwrapped column.
///
/// `MeasurementConfig` takes one of these as its starting point and returns
/// an updated copy from each `goto_*` call.
#[derive(Clone, Copy, Default)]
pub struct UcdCursor {
/// Offset in bytes within the buffer.
pub offset: usize,
/// Position in the buffer in lines (.y) and grapheme clusters (.x).
/// Line wrapping has NO influence on this.
pub logical_pos: Point,
/// Position in the buffer in laid out rows (.y) and columns (.x).
/// Line wrapping has an influence on this.
pub visual_pos: Point,
/// Horizontal position in visual columns.
/// Line wrapping has NO influence on this and if word wrap is disabled,
/// it's identical to `visual_pos.x`. This is useful for calculating tab widths.
pub column: CoordType,
}
/// Snapshot of the measuring state, taken right after a grapheme cluster
/// behind which the line may be broken.
///
/// `measure_forward` rewinds to the most recent snapshot when a later cluster
/// no longer fits into the word wrap column.
pub struct WrapOpportunity {
// Absolute byte offset of the break, i.e. just past the preceding cluster.
absolute_offset: usize,
// Value of the chunk iterator's `offset()` when the snapshot was taken.
// It is handed back to the iterator's `seek()` when rewinding.
offset_next_cluster: usize,
// Grapheme cluster properties of the cluster that follows the break.
props_next_cluster: usize,
// Logical column (in grapheme clusters) at the break.
logical_pos_x: CoordType,
}
/// Measures text in a `Document`: it carries the layout settings and a cursor
/// that the `goto_*` methods advance through the buffer.
pub struct MeasurementConfig<'doc> {
// The document the cursor moves through.
buffer: &'doc dyn Document,
// Distance between tab stops, in columns.
tab_size: CoordType,
// Visual column at which lines get wrapped. `CoordType::MAX` turns wrapping off.
word_wrap_column: CoordType,
// Current position; updated by every `goto_*` call.
cursor: UcdCursor,
}
impl<'doc> MeasurementConfig<'doc> {
/// Creates a measurement over `buffer` with a tab size of 8 columns,
/// word wrap turned off (`CoordType::MAX`) and a default-initialized cursor.
pub fn new(buffer: &'doc dyn Document) -> Self {
    let cursor = UcdCursor::default();
    Self {
        cursor,
        word_wrap_column: CoordType::MAX,
        tab_size: 8,
        buffer,
    }
}
pub fn with_tab_size(mut self, tab_size: CoordType) -> Self {
self.tab_size = tab_size;
self
}
pub fn with_word_wrap_column(mut self, word_wrap_column: CoordType) -> Self {
self.word_wrap_column = word_wrap_column;
self
}
pub fn with_cursor(mut self, cursor: UcdCursor) -> Self {
self.cursor = cursor;
self
}
pub fn goto_offset(&mut self, offset: usize) -> UcdCursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
offset,
Point::MAX,
Point::MAX,
self.cursor,
self.buffer,
);
self.cursor
}
pub fn goto_logical(&mut self, logical_target: Point) -> UcdCursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
usize::MAX,
logical_target,
Point::MAX,
self.cursor,
self.buffer,
);
self.cursor
}
pub fn goto_visual(&mut self, visual_target: Point) -> UcdCursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
usize::MAX,
Point::MAX,
visual_target,
self.cursor,
self.buffer,
);
self.cursor
}
pub fn cursor(&self) -> UcdCursor {
self.cursor
}
/// Walks forward from `cursor` through `buffer`, one grapheme cluster at a time,
/// until the byte offset, the logical position or the visual position hits its target.
///
/// * `tab_size`: distance between tab stops; a tab is `tab_size - (column % tab_size)` wide.
/// * `word_wrap_column`: visual column at which lines wrap. `CoordType::MAX` disables wrapping.
/// * `offset_target`, `logical_target`, `visual_target`: the stop conditions. The `goto_*`
///   methods pass `usize::MAX` / `Point::MAX` for the ones they are not interested in.
/// * `cursor`: where to start. It is returned unchanged if it already is at or past
///   the logical or the visual target.
/// * `buffer`: the document to read from. An empty chunk ends the measurement.
///
/// Returns the cursor at the position where measuring stopped. A newline is not crossed
/// if the target lies on the current line, so asking for a column beyond the end of a
/// line yields the end of that line.
///
/// NOTE(review): a `tab_size` of 0 makes `column % tab_size` divide by zero once a tab
/// is encountered — confirm that callers never pass 0.
fn measure_forward(
tab_size: CoordType,
word_wrap_column: CoordType,
offset_target: usize,
logical_target: Point,
visual_target: Point,
cursor: UcdCursor,
buffer: &dyn Document,
) -> UcdCursor {
if cursor.logical_pos >= logical_target || cursor.visual_pos >= visual_target {
return cursor;
}
// Last position at which the line may be broken, if any was seen on this visual row.
let mut wrap: Option<WrapOpportunity> = None;
// A target that was reached while a wrap opportunity was pending. It is only
// tentative, because a later word wrap may rewind to before it and discard it.
let mut hit: Option<UcdCursor> = None;
let mut absolute_offset = cursor.offset;
let mut logical_pos_x = cursor.logical_pos.x;
let mut logical_pos_y = cursor.logical_pos.y;
let mut visual_pos_x = cursor.visual_pos.x;
let mut visual_pos_y = cursor.visual_pos.y;
let mut column = cursor.column;
// The targets reduced to the current line/row: the column to stop at,
// `MAX` if the target row is still ahead, or 0 if it is already behind us.
let (mut offset_target_x, mut logical_target_x, mut visual_target_x) = Self::recalc_target(
offset_target,
logical_target,
visual_target,
logical_pos_y,
visual_pos_y,
);
// Outer loop: one iteration per chunk read from the document.
'outer: loop {
let chunk = buffer.read_forward(absolute_offset);
let chunk_beg = absolute_offset;
let chunk_end = absolute_offset + chunk.len();
let mut it = Utf8Chars::new(chunk, 0);
// No more characters: we've reached the end of the document.
let Some(mut ch) = it.next() else {
break;
};
let mut props_next_cluster = ucd_grapheme_cluster_lookup(ch);
// Inner loop: one iteration per grapheme cluster (or per rewind/wrap).
loop {
if absolute_offset >= chunk_end {
// Chunk exhausted, fetch the next one.
break;
}
if absolute_offset >= offset_target_x
|| logical_pos_x >= logical_target_x
|| visual_pos_x >= visual_target_x
{
// Without a pending wrap opportunity the position is final.
if wrap.is_none() {
break 'outer;
}
// Otherwise remember it and keep going until we know
// whether the current word gets wrapped onto the next row.
hit = Some(UcdCursor {
offset: absolute_offset,
logical_pos: Point {
x: logical_pos_x,
y: logical_pos_y,
},
visual_pos: Point {
x: visual_pos_x,
y: visual_pos_y,
},
column,
});
// Prevent hits on the same line until we encounter a line wrap or explicit newline.
offset_target_x = usize::MAX;
logical_target_x = CoordType::MAX;
visual_target_x = CoordType::MAX;
}
let props_current_cluster = props_next_cluster;
let is_tab = ch == '\t';
let mut offset_next_cluster;
let mut width = 0;
let mut state = 0;
// Figure out the length and width of the rest of the grapheme cluster.
// When this loop ends, `ch` and `props_next_cluster` describe the first
// character of the following cluster (if the chunk had another character).
loop {
offset_next_cluster = it.offset();
width += ucd_grapheme_cluster_character_width(props_next_cluster) as CoordType;
let Some(ch_next) = it.next() else {
break;
};
ch = ch_next;
let props_trail = ucd_grapheme_cluster_lookup(ch);
state = ucd_grapheme_cluster_joins(state, props_next_cluster, props_trail);
props_next_cluster = props_trail;
if ucd_grapheme_cluster_joins_done(state) {
break;
}
}
// Turn the chunk-relative offset into an absolute one.
let offset_next_cluster = chunk_beg + offset_next_cluster;
if is_tab {
// Tabs require special handling because they can have a variable width.
width = tab_size - (column % tab_size);
} else {
// A cluster occupies at most 2 columns, no matter how many characters it has.
width = width.min(2);
}
// Hard wrap: Both the logical and visual position advance by one line.
if ucd_grapheme_cluster_is_newline(props_current_cluster) {
// Don't cross the newline if the target is on this line.
// E.g. if the caller asks for column 100 on a 10 column line,
// we'll return with the cursor set to column 10.
if logical_pos_y >= logical_target.y || visual_pos_y >= visual_target.y {
break 'outer;
}
logical_pos_x = 0;
logical_pos_y += 1;
visual_pos_x = 0;
visual_pos_y += 1;
column = 0;
// We moved the logical/visual pos past the newline,
// so we also need to move the offset past it.
absolute_offset = offset_next_cluster;
(offset_target_x, logical_target_x, visual_target_x) = Self::recalc_target(
offset_target,
logical_target,
visual_target,
logical_pos_y,
visual_pos_y,
);
continue;
}
// Line/word-wrap handling.
if word_wrap_column != CoordType::MAX && visual_pos_x + width > word_wrap_column {
// Reset to the last break opportunity, if there was any.
// NOTE(review): `ch` is not restored here, so `is_tab` of the next iteration is
// computed from the last decoded character, not from the cluster we rewind to — confirm.
// NOTE(review): without a break opportunity nothing is rewound, although the iterator
// has already consumed the current cluster. It looks like that cluster is then
// never added to the offset/position — confirm.
if let Some(ref w) = wrap {
absolute_offset = w.absolute_offset;
it.seek(w.offset_next_cluster);
props_next_cluster = w.props_next_cluster;
logical_pos_x = w.logical_pos_x;
}
// Wrap!
visual_pos_x = 0;
visual_pos_y += 1;
(offset_target_x, logical_target_x, visual_target_x) = Self::recalc_target(
offset_target,
logical_target,
visual_target,
logical_pos_y,
visual_pos_y,
);
// Whatever was recorded on the previous row is void now.
wrap = None;
hit = None;
if absolute_offset < chunk_beg {
// We've had to reset to a point before this chunk,
// so we have to re-read the previous contents.
break;
}
continue;
}
// Avoid advancing past the visual target, because `width` can be greater than 1.
if visual_pos_x + width > visual_target_x {
if word_wrap_column == CoordType::MAX || wrap.is_none() {
break 'outer;
}
// Same as above: with a pending wrap opportunity the hit is only tentative.
hit = Some(UcdCursor {
offset: absolute_offset,
logical_pos: Point {
x: logical_pos_x,
y: logical_pos_y,
},
visual_pos: Point {
x: visual_pos_x,
y: visual_pos_y,
},
column,
});
// Prevent hits on the same line until we encounter a line wrap or explicit newline.
offset_target_x = usize::MAX;
logical_target_x = CoordType::MAX;
visual_target_x = CoordType::MAX;
}
// The cluster fits: step over it.
absolute_offset = offset_next_cluster;
logical_pos_x += 1;
visual_pos_x += width;
column += width;
if word_wrap_column != CoordType::MAX
&& !ucd_line_break_joins(props_current_cluster, props_next_cluster)
{
// We got past the tentative hit without having to wrap, so it's confirmed.
if hit.is_some() {
break 'outer;
}
// The line may be broken here. Remember the state needed to rewind to this point.
wrap = Some(WrapOpportunity {
absolute_offset,
offset_next_cluster: it.offset(),
props_next_cluster,
logical_pos_x,
});
}
}
}
// A row that is filled up to the wrap column continues on the next visual row.
// This only affects the cursor built below, not a recorded `hit`.
if visual_pos_x >= word_wrap_column {
visual_pos_x = 0;
visual_pos_y += 1;
}
if let Some(c) = hit {
return c;
}
UcdCursor {
offset: absolute_offset,
logical_pos: Point {
x: logical_pos_x,
y: logical_pos_y,
},
visual_pos: Point {
x: visual_pos_x,
y: visual_pos_y,
},
column,
}
}
/// Reduces the three targets to what matters on the current line/row:
/// the byte offset as is, plus the logical and visual column to stop at
/// (see `target_column`).
#[inline]
fn recalc_target(
    offset_target: usize,
    logical_target: Point,
    visual_target: Point,
    logical_pos_y: CoordType,
    visual_pos_y: CoordType,
) -> (usize, CoordType, CoordType) {
    let logical_x = Self::target_column(logical_target, logical_pos_y);
    let visual_x = Self::target_column(visual_target, visual_pos_y);
    (offset_target, logical_x, visual_x)
}
/// Returns the column to stop at while on row `y`, given the overall `target`:
/// `CoordType::MAX` while the target row is still ahead, `target.x` on the
/// target row itself, and 0 once the target row is behind us.
#[inline]
fn target_column(target: Point, y: CoordType) -> CoordType {
    match target.y.cmp(&y) {
        // The target row comes after `y`.
        Ordering::Greater => CoordType::MAX,
        Ordering::Equal => target.x,
        // The target row comes before `y`.
        Ordering::Less => 0,
    }
}
}
/// Coarse classification of a byte, as used by word-wise cursor movement.
#[derive(Clone, Copy, PartialEq, Eq)]
enum CharClass {
    Whitespace,
    Newline,
    Separator,
    Word,
}

/// Builds the byte -> `CharClass` lookup table at compile time.
///
/// Space and tab are whitespace, `\r` and `\n` are newlines, every byte listed in
/// `seperators` is a separator and everything else (including all non-ASCII bytes)
/// counts as part of a word. Fails to compile/panics if a separator isn't ASCII.
const fn construct_classifier(seperators: &[u8]) -> [CharClass; 256] {
    let mut table = [CharClass::Word; 256];
    table[b'\r' as usize] = CharClass::Newline;
    table[b'\n' as usize] = CharClass::Newline;
    table[b'\t' as usize] = CharClass::Whitespace;
    table[b' ' as usize] = CharClass::Whitespace;

    // Separators are applied last so that they win over the defaults above.
    // `for` loops aren't available in `const fn`, hence the manual walk.
    let mut remaining = seperators;
    while let [ch, rest @ ..] = remaining {
        assert!(*ch < 128, "Only ASCII separators are supported.");
        table[*ch as usize] = CharClass::Separator;
        remaining = rest;
    }

    table
}

/// The classification table used by `word_forward` and `word_backward`.
const WORD_CLASSIFIER: [CharClass; 256] =
    construct_classifier(br#"`~!@#$%^&*()-=+[{]}\|;:'",.<>/?"#);
/// Finds the next word boundary after `offset`.
///
/// `doc` is the document to search and `offset` the byte offset to start from.
/// Returns the byte offset of the next word boundary.
pub fn word_forward(doc: &dyn Document, offset: usize) -> usize {
    // The navigator starts out without a chunk and reads from `doc` as it goes.
    let nav = WordForward {
        doc,
        offset,
        chunk: &[],
        chunk_off: 0,
    };
    word_navigation(nav)
}
/// Finds the previous word boundary before `offset`.
///
/// This is the backward counterpart of `word_forward`: `doc` is the document
/// to search and `offset` the byte offset to start from.
/// Returns the byte offset of the previous word boundary.
pub fn word_backward(doc: &dyn Document, offset: usize) -> usize {
    // The navigator starts out without a chunk and reads from `doc` as it goes.
    let nav = WordBackward {
        doc,
        offset,
        chunk: &[],
        chunk_off: 0,
    };
    word_navigation(nav)
}
/// Direction-agnostic word navigation. Matches the behavior of VS Code.
///
/// `nav` hides the direction: it is either a `WordForward` or a `WordBackward`.
/// Returns the offset at which the navigator ends up.
fn word_navigation<T: WordNavigation>(mut nav: T) -> usize {
    // Step over at most one newline, followed by any amount of whitespace.
    nav.skip_newline();
    nav.skip_class(CharClass::Whitespace);

    // Whatever comes next decides what gets skipped. Running out of input counts as whitespace.
    let class = nav.peek(CharClass::Whitespace);
    if class != CharClass::Separator && class != CharClass::Word {
        return nav.offset();
    }

    // Skip one word or separator character, and then the rest of that run.
    nav.next();
    let after_first = nav.offset();
    nav.skip_class(class);

    // A single separator in front of a word is treated as part of that word.
    let lone_separator = class == CharClass::Separator && nav.offset() == after_first;
    if lone_separator {
        nav.skip_class(CharClass::Word);
    }

    nav.offset()
}
/// The primitive steps `word_navigation` is built from.
/// Implemented once per direction by `WordForward` and `WordBackward`.
trait WordNavigation {
// Steps over one newline (`\r`, `\n` or both), if the navigator is at one.
fn skip_newline(&mut self);
// Steps over all consecutive bytes of the given class.
fn skip_class(&mut self, class: CharClass);
// Returns the class of the byte the navigator would step over next,
// or `default` if the current chunk has no such byte.
fn peek(&self, default: CharClass) -> CharClass;
// Steps over a single byte.
fn next(&mut self);
// Returns the current position as a byte offset into the document.
fn offset(&self) -> usize;
}
/// Forward-moving implementation of `WordNavigation`.
struct WordForward<'a> {
    /// The document being navigated.
    doc: &'a dyn Document,
    /// Absolute byte offset of the start of `chunk`.
    offset: usize,
    /// The chunk of the document currently being looked at.
    chunk: &'a [u8],
    /// Position within `chunk`. Bytes before it have been stepped over.
    chunk_off: usize,
}

impl WordForward<'_> {
    /// Moves on to the chunk that follows the current one.
    /// At the end of the document the new chunk is empty.
    fn load_next_chunk(&mut self) {
        self.offset += self.chunk.len();
        self.chunk = self.doc.read_forward(self.offset);
        self.chunk_off = 0;
    }
}

impl WordNavigation for WordForward<'_> {
    fn skip_newline(&mut self) {
        // This is the first step of a navigation, at which point no chunk has been
        // loaded yet. Without loading one here a newline would never be seen,
        // and the cursor would get stuck at the end of a line.
        if self.chunk_off >= self.chunk.len() {
            self.load_next_chunk();
        }

        // We can rely on the fact that the document does not split graphemes across chunks.
        // = If there's a newline it's wholly contained in this chunk.
        if self.chunk.get(self.chunk_off) == Some(&b'\r') {
            self.chunk_off += 1;
        }
        if self.chunk.get(self.chunk_off) == Some(&b'\n') {
            self.chunk_off += 1;
        }
    }

    fn skip_class(&mut self, class: CharClass) {
        loop {
            while let Some(&byte) = self.chunk.get(self.chunk_off) {
                if WORD_CLASSIFIER[byte as usize] != class {
                    return;
                }
                self.chunk_off += 1;
            }

            self.load_next_chunk();

            // An empty chunk means that we've reached the end of the document.
            // Without this check we'd keep on requesting empty chunks forever.
            if self.chunk.is_empty() {
                return;
            }
        }
    }

    fn peek(&self, default: CharClass) -> CharClass {
        match self.chunk.get(self.chunk_off) {
            Some(&byte) => WORD_CLASSIFIER[byte as usize],
            None => default,
        }
    }

    fn next(&mut self) {
        self.chunk_off += 1;
    }

    fn offset(&self) -> usize {
        self.offset + self.chunk_off
    }
}
/// Backward-moving implementation of `WordNavigation`.
struct WordBackward<'a> {
    /// The document being navigated.
    doc: &'a dyn Document,
    /// Absolute byte offset of the end of `chunk`.
    offset: usize,
    /// The chunk of the document currently being looked at.
    chunk: &'a [u8],
    /// Position within `chunk`. Bytes at and after it have been stepped over.
    chunk_off: usize,
}

impl WordBackward<'_> {
    /// Moves on to the chunk that precedes the current one.
    /// At the start of the document the new chunk is empty.
    fn load_prev_chunk(&mut self) {
        self.offset -= self.chunk.len();
        self.chunk = self.doc.read_backward(self.offset);
        self.chunk_off = self.chunk.len();
    }
}

impl WordNavigation for WordBackward<'_> {
    fn skip_newline(&mut self) {
        // This is the first step of a navigation, at which point no chunk has been
        // loaded yet. Without loading one here a newline would never be seen,
        // and the cursor would get stuck at the start of a line.
        if self.chunk_off == 0 {
            self.load_prev_chunk();
        }

        // We can rely on the fact that the document does not split graphemes across chunks.
        // = If there's a newline it's wholly contained in this chunk.
        // Since we're moving backward, the `\n` of a `\r\n` pair comes first.
        if self.chunk_off > 0 && self.chunk[self.chunk_off - 1] == b'\n' {
            self.chunk_off -= 1;
        }
        if self.chunk_off > 0 && self.chunk[self.chunk_off - 1] == b'\r' {
            self.chunk_off -= 1;
        }
    }

    fn skip_class(&mut self, class: CharClass) {
        loop {
            while self.chunk_off > 0 {
                if WORD_CLASSIFIER[self.chunk[self.chunk_off - 1] as usize] != class {
                    return;
                }
                self.chunk_off -= 1;
            }

            self.load_prev_chunk();

            // An empty chunk means that we've reached the start of the document.
            // Without this check we'd keep on requesting empty chunks forever.
            if self.chunk.is_empty() {
                return;
            }
        }
    }

    fn peek(&self, default: CharClass) -> CharClass {
        match self.chunk_off.checked_sub(1) {
            Some(idx) => WORD_CLASSIFIER[self.chunk[idx] as usize],
            None => default,
        }
    }

    fn next(&mut self) {
        self.chunk_off -= 1;
    }

    fn offset(&self) -> usize {
        // `self.offset` is the end of the chunk, so go back to its start first.
        self.offset - self.chunk.len() + self.chunk_off
    }
}
/// Seeks forward from `offset` (which is on line `line`) to the start of line `line_stop`.
///
/// Returns the new offset together with the line it is on. If the text ends before
/// `line_stop` is reached, the search stops with the line count reached until then.
/// `\r`, `\n` and `\r\n` each count as one newline.
pub fn newlines_forward(
    text: &[u8],
    mut offset: usize,
    mut line: CoordType,
    line_stop: CoordType,
) -> (usize, CoordType) {
    // Nothing to seek forward to: leave the cursor at the beginning of the
    // current line instead, which is exactly what `newlines_backward` does.
    if line >= line_stop {
        return newlines_backward(text, offset, line, line_stop);
    }

    let len = text.len();
    offset = offset.min(len);

    while line < line_stop {
        offset = memchr2(b'\r', b'\n', text, offset);
        if offset >= len {
            break;
        }

        // A `\r\n` pair is a single newline, but two bytes long.
        let is_crlf = text[offset] == b'\r' && text.get(offset + 1) == Some(&b'\n');
        offset += if is_crlf { 2 } else { 1 };
        line += 1;
    }

    (offset, line)
}
/// Seeks backward from `offset` (which is on line `line`) to the start of line `line_stop`.
///
/// No matter what parameters are given, it only returns an offset at the start of a line.
/// Put differently, even if `line == line_stop`, it'll seek backward to the line start.
/// Returns the new offset together with the line it is on.
pub fn newlines_backward(
    text: &[u8],
    mut offset: usize,
    mut line: CoordType,
    line_stop: CoordType,
) -> (usize, CoordType) {
    offset = offset.min(text.len());

    while let Some(newline) = memrchr2(b'\r', b'\n', text, offset) {
        if line <= line_stop {
            // +1: Past the newline, at the start of the current line.
            return (newline + 1, line);
        }

        // A `\r\n` pair is a single newline: continue the search in front of the `\r`.
        let is_crlf = text[newline] == b'\n' && newline != 0 && text[newline - 1] == b'\r';
        offset = if is_crlf { newline - 1 } else { newline };
        line -= 1;
    }

    // No newline left in front of us: we're on the first line of the text.
    (0, line)
}
/// Removes a single trailing newline (`\r\n`, `\n` or `\r`) from `text`, if there is one.
pub fn strip_newline(text: &[u8]) -> &[u8] {
    match text {
        // The two-byte form has to be checked first so that both bytes get removed.
        [line @ .., b'\r', b'\n'] => line,
        [line @ .., b'\n' | b'\r'] => line,
        _ => text,
    }
}
#[cfg(test)]
mod test {
use super::*;
/// Going to the first column of the second visual row
/// must place the cursor right behind the newline.
#[test]
fn test_measure_forward_newline_start() {
    let text = "foo\nbar".as_bytes();
    let line_start = Point { x: 0, y: 1 };

    let mut cfg = MeasurementConfig::new(&text);
    let cursor = cfg.goto_visual(Point { x: 0, y: 1 });

    assert_eq!(cursor.offset, 4);
    assert_eq!(cursor.logical_pos, line_start);
    assert_eq!(cursor.visual_pos, line_start);
}
// The cluster behind the "a" is wider than 1 column. Asking for visual column 2
// must not step over it, which leaves the cursor right behind the "a".
// NOTE(review): the string literal presumably contains invisible code points
// (joiners/variation selectors) — take care not to alter it when editing.
#[test]
fn test_measure_forward_clipped_wide_char() {
let cursor = MeasurementConfig::new(&"a😶🌫b".as_bytes()).goto_visual(Point { x: 2, y: 0 });
assert_eq!(cursor.offset, 1);
assert_eq!(cursor.logical_pos, Point { x: 1, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 1, y: 0 });
}
// Exercises word wrapping at column 6. The `goto_visual` calls below all go through
// the same config, so each one continues from where the previous one stopped.
#[test]
fn test_measure_forward_word_wrap() {
// Expected layout:
// |foo␣ |
// |bar␣ |
// |baz |
let text = "foo bar \nbaz".as_bytes();
// A logical target inside "bar", which has been wrapped onto the second row.
let cursor = MeasurementConfig::new(&text)
.with_word_wrap_column(6)
.goto_logical(Point { x: 5, y: 0 });
assert_eq!(cursor.offset, 5);
assert_eq!(cursor.logical_pos, Point { x: 5, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 1, y: 1 });
let mut cfg = MeasurementConfig::new(&text).with_word_wrap_column(6);
// A visual target past the end of the first row ends up at the start of the second row.
let cursor = cfg.goto_visual(Point { x: 5, y: 0 });
assert_eq!(cursor.offset, 4);
assert_eq!(cursor.logical_pos, Point { x: 4, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 1 });
// Asking for that position explicitly doesn't move the cursor.
let cursor = cfg.goto_visual(Point { x: 0, y: 1 });
assert_eq!(cursor.offset, 4);
assert_eq!(cursor.logical_pos, Point { x: 4, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 1 });
// A column far beyond the end of the second row stops in front of the newline.
let cursor = cfg.goto_visual(Point { x: 100, y: 1 });
assert_eq!(cursor.offset, 8);
assert_eq!(cursor.logical_pos, Point { x: 8, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 4, y: 1 });
// The start of the third row is behind the newline, on the second logical line.
let cursor = cfg.goto_visual(Point { x: 0, y: 2 });
assert_eq!(cursor.offset, 9);
assert_eq!(cursor.logical_pos, Point { x: 0, y: 1 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 2 });
// A column far beyond the end of the last row stops at the end of the text.
let cursor = cfg.goto_visual(Point { x: 100, y: 2 });
assert_eq!(cursor.offset, 12);
assert_eq!(cursor.logical_pos, Point { x: 3, y: 1 });
assert_eq!(cursor.visual_pos, Point { x: 3, y: 2 });
}
}