Implement support for Ambiguous = Wide (#406)
Some checks are pending
CI / check (ubuntu-latest) (push) Waiting to run
CI / check (windows-latest) (push) Waiting to run

Does what it says on the tin. It's just a lot in the can.

Closes #115
This commit is contained in:
Leonard Hecker 2025-06-04 23:39:56 +02:00 committed by GitHub
parent db1e813603
commit f8bea2be19
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 1125 additions and 994 deletions

View file

@ -199,6 +199,13 @@ impl DocumentManager {
Ok(self.list.front_mut().unwrap())
}
/// Re-runs `reflow()` on the text buffer of every document in the list.
///
/// Each buffer is mutably borrowed via `borrow_mut()` only for the duration
/// of its own reflow call.
pub fn reflow_all(&self) {
    for entry in &self.list {
        entry.buffer.borrow_mut().reflow();
    }
}
pub fn open_for_reading(path: &Path) -> apperr::Result<File> {
File::open(path).map_err(apperr::Error::from)
}

View file

@ -23,12 +23,12 @@ use draw_menubar::*;
use draw_statusbar::*;
use edit::arena::{self, Arena, ArenaString, scratch_arena};
use edit::framebuffer::{self, IndexedColor};
use edit::helpers::{KIBI, MEBI, MetricFormatter, Rect, Size};
use edit::helpers::{CoordType, KIBI, MEBI, MetricFormatter, Rect, Size};
use edit::input::{self, kbmod, vk};
use edit::oklab::oklab_blend;
use edit::tui::*;
use edit::vt::{self, Token};
use edit::{apperr, arena_format, base64, path, sys};
use edit::{apperr, arena_format, base64, path, sys, unicode};
use localization::*;
use state::*;
@ -79,7 +79,7 @@ fn run() -> apperr::Result<()> {
let mut input_parser = input::Parser::new();
let mut tui = Tui::new()?;
let _restore = setup_terminal(&mut tui, &mut vt_parser);
let _restore = setup_terminal(&mut tui, &mut state, &mut vt_parser);
state.menubar_color_bg = oklab_blend(
tui.indexed(IndexedColor::Background),
@ -502,7 +502,7 @@ impl Drop for RestoreModes {
}
}
fn setup_terminal(tui: &mut Tui, vt_parser: &mut vt::Parser) -> RestoreModes {
fn setup_terminal(tui: &mut Tui, state: &mut State, vt_parser: &mut vt::Parser) -> RestoreModes {
sys::write_stdout(concat!(
// 1049: Alternative Screen Buffer
// I put the ASB switch in the beginning, just in case the terminal performs
@ -517,6 +517,12 @@ fn setup_terminal(tui: &mut Tui, vt_parser: &mut vt::Parser) -> RestoreModes {
"\x1b]4;8;?;9;?;10;?;11;?;12;?;13;?;14;?;15;?\x07",
// OSC 10 and 11 queries for the current foreground and background colors.
"\x1b]10;?\x07\x1b]11;?\x07",
// Test whether ambiguous width characters are two columns wide.
// We use "…", because it's the most common ambiguous width character we use,
// and the old Windows conhost doesn't actually use wcwidth, it measures the
// actual display width of the character and assigns it columns accordingly.
// We detect it by writing the character and asking for the cursor position.
"\r\x1b[6n",
// CSI c reports the terminal capabilities.
// It also helps us to detect the end of the responses, because not all
// terminals support the OSC queries, but all of them support CSI c.
@ -527,6 +533,7 @@ fn setup_terminal(tui: &mut Tui, vt_parser: &mut vt::Parser) -> RestoreModes {
let mut osc_buffer = String::new();
let mut indexed_colors = framebuffer::DEFAULT_THEME;
let mut color_responses = 0;
let mut ambiguous_width = 1;
while !done {
let scratch = scratch_arena(None);
@ -537,7 +544,12 @@ fn setup_terminal(tui: &mut Tui, vt_parser: &mut vt::Parser) -> RestoreModes {
let mut vt_stream = vt_parser.parse(&input);
while let Some(token) = vt_stream.next() {
match token {
Token::Csi(state) if state.final_byte == 'c' => done = true,
Token::Csi(csi) => match csi.final_byte {
'c' => done = true,
// CPR (Cursor Position Report) response.
'R' => ambiguous_width = csi.params[1] as CoordType - 1,
_ => {}
},
Token::Osc { mut data, partial } => {
if partial {
osc_buffer.push_str(data);
@ -594,6 +606,11 @@ fn setup_terminal(tui: &mut Tui, vt_parser: &mut vt::Parser) -> RestoreModes {
}
}
if ambiguous_width == 2 {
unicode::setup_ambiguous_width(2);
state.documents.reflow_all();
}
if color_responses == indexed_colors.len() {
tui.setup_indexed_colors(indexed_colors);
}

View file

@ -427,7 +427,7 @@ impl TextBuffer {
false
} else {
self.margin_enabled = enabled;
self.reflow(true);
self.reflow();
true
}
}
@ -482,7 +482,7 @@ impl TextBuffer {
false
} else {
self.width = width;
self.reflow(true);
self.reflow();
true
}
}
@ -499,7 +499,7 @@ impl TextBuffer {
false
} else {
self.tab_size = width;
self.reflow(true);
self.reflow();
true
}
}
@ -524,7 +524,7 @@ impl TextBuffer {
self.ruler = column;
}
fn reflow(&mut self, force: bool) {
pub fn reflow(&mut self) {
// +1 onto logical_lines, because line numbers are 1-based.
// +1 onto log10, because we want the digit width and not the actual log10.
// +3 onto log10, because we append " | " to the line numbers to form the margin.
@ -536,24 +536,25 @@ impl TextBuffer {
let text_width = self.text_width();
// 2 columns are required, because otherwise wide glyphs wouldn't ever fit.
let word_wrap_column =
self.word_wrap_column =
if self.word_wrap_enabled && text_width >= 2 { text_width } else { 0 };
if force || self.word_wrap_column > word_wrap_column {
self.word_wrap_column = word_wrap_column;
if self.cursor.offset != 0 {
self.cursor = self
.cursor_move_to_logical_internal(Default::default(), self.cursor.logical_pos);
}
// Recalculate the line statistics.
if self.word_wrap_enabled {
let end = self.cursor_move_to_logical_internal(self.cursor, Point::MAX);
self.stats.visual_lines = end.visual_pos.y + 1;
// Recalculate the cursor position.
self.cursor = self.cursor_move_to_logical_internal(
if self.word_wrap_column > 0 {
Default::default()
} else {
self.stats.visual_lines = self.stats.logical_lines;
}
self.goto_line_start(self.cursor, self.cursor.logical_pos.y)
},
self.cursor.logical_pos,
);
// Recalculate the line statistics.
if self.word_wrap_column > 0 {
let end = self.cursor_move_to_logical_internal(self.cursor, Point::MAX);
self.stats.visual_lines = end.visual_pos.y + 1;
} else {
self.stats.visual_lines = self.stats.logical_lines;
}
self.cursor_for_rendering = None;
@ -583,7 +584,7 @@ impl TextBuffer {
self.set_selection(None);
self.search = None;
self.mark_as_clean();
self.reflow(true);
self.reflow();
}
/// Copies the contents of the buffer into a string.
@ -2312,9 +2313,7 @@ impl TextBuffer {
}
self.search = None;
// Also takes care of clearing `cursor_for_rendering`.
self.reflow(false);
self.cursor_for_rendering = None;
}
/// Undo the last edit operation.
@ -2428,8 +2427,7 @@ impl TextBuffer {
}
}
// Also takes care of clearing `cursor_for_rendering`.
self.reflow(false);
self.cursor_for_rendering = None;
}
/// For interfacing with ICU.

View file

@ -9,6 +9,25 @@ use crate::document::ReadableDocument;
use crate::helpers::{CoordType, Point};
use crate::simd::{memchr2, memrchr2};
// On one hand it's disgusting that I wrote this as a global variable, but on the
// other hand, this isn't a public library API, and it makes the code a lot cleaner,
// because we don't need to inject this once-per-process value everywhere.
//
// An atomic is used instead of `static mut`, so that neither the setter nor the getter
// needs `unsafe`, and so that soundness doesn't hinge on the "set once" convention.
// `Relaxed` is sufficient, because no other memory is published through this value.
static AMBIGUOUS_WIDTH: std::sync::atomic::AtomicUsize =
    std::sync::atomic::AtomicUsize::new(1);

/// Sets the width of "ambiguous" width characters as per "UAX #11: East Asian Width".
///
/// Only 1 (narrow) and 2 (wide) are meaningful widths. Any other value, including
/// negative ones, is clamped into that range instead of wrapping around in the cast.
///
/// Defaults to 1.
pub fn setup_ambiguous_width(ambiguous_width: CoordType) {
    let width = ambiguous_width.clamp(1, 2) as usize;
    AMBIGUOUS_WIDTH.store(width, std::sync::atomic::Ordering::Relaxed);
}

/// Returns the number of columns currently assigned to ambiguous width characters.
#[inline]
fn ambiguous_width() -> usize {
    AMBIGUOUS_WIDTH.load(std::sync::atomic::Ordering::Relaxed)
}
/// Stores a position inside a [`ReadableDocument`].
///
/// The cursor tracks both the absolute byte-offset,
@ -40,16 +59,25 @@ pub struct Cursor {
/// Your entrypoint to navigating inside a [`ReadableDocument`].
#[derive(Clone)]
pub struct MeasurementConfig<'doc> {
buffer: &'doc dyn ReadableDocument,
cursor: Cursor,
tab_size: CoordType,
word_wrap_column: CoordType,
cursor: Cursor,
buffer: &'doc dyn ReadableDocument,
}
impl<'doc> MeasurementConfig<'doc> {
/// Creates a new [`MeasurementConfig`] for the given document.
pub fn new(buffer: &'doc dyn ReadableDocument) -> Self {
Self { buffer, tab_size: 8, word_wrap_column: 0, cursor: Default::default() }
Self { cursor: Default::default(), tab_size: 8, word_wrap_column: 0, buffer }
}
/// Replaces the cursor that subsequent navigation starts from.
///
/// WARNING: An invalid cursor will not make the code panic,
/// but the results will obviously be complete garbage.
pub fn with_cursor(self, cursor: Cursor) -> Self {
    let mut config = self;
    config.cursor = cursor;
    config
}
/// Sets the tab size.
@ -68,31 +96,13 @@ impl<'doc> MeasurementConfig<'doc> {
self
}
/// Sets the initial cursor to the given position.
///
/// WARNING: While the code doesn't panic if the cursor is invalid,
/// the results will obviously be complete garbage.
pub fn with_cursor(mut self, cursor: Cursor) -> Self {
self.cursor = cursor;
self
}
/// Navigates **forward** to the given absolute offset.
///
/// # Returns
///
/// The cursor position after the navigation.
pub fn goto_offset(&mut self, offset: usize) -> Cursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
offset,
Point::MAX,
Point::MAX,
self.cursor,
self.buffer,
);
self.cursor
self.measure_forward(offset, Point::MAX, Point::MAX)
}
/// Navigates **forward** to the given logical position.
@ -103,16 +113,7 @@ impl<'doc> MeasurementConfig<'doc> {
///
/// The cursor position after the navigation.
pub fn goto_logical(&mut self, logical_target: Point) -> Cursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
usize::MAX,
logical_target,
Point::MAX,
self.cursor,
self.buffer,
);
self.cursor
self.measure_forward(usize::MAX, logical_target, Point::MAX)
}
/// Navigates **forward** to the given visual position.
@ -123,16 +124,7 @@ impl<'doc> MeasurementConfig<'doc> {
///
/// The cursor position after the navigation.
pub fn goto_visual(&mut self, visual_target: Point) -> Cursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
usize::MAX,
Point::MAX,
visual_target,
self.cursor,
self.buffer,
);
self.cursor
self.measure_forward(usize::MAX, Point::MAX, visual_target)
}
/// Returns the current cursor position.
@ -149,27 +141,24 @@ impl<'doc> MeasurementConfig<'doc> {
// the wrap exists on both lines and it'll default to wrapping. `goto_visual` however will always
// try to return a Y position that matches the requested position, so that Home/End works properly.
fn measure_forward(
tab_size: CoordType,
word_wrap_column: CoordType,
&mut self,
offset_target: usize,
logical_target: Point,
visual_target: Point,
cursor: Cursor,
buffer: &dyn ReadableDocument,
) -> Cursor {
if cursor.offset >= offset_target
|| cursor.logical_pos >= logical_target
|| cursor.visual_pos >= visual_target
if self.cursor.offset >= offset_target
|| self.cursor.logical_pos >= logical_target
|| self.cursor.visual_pos >= visual_target
{
return cursor;
return self.cursor;
}
let mut offset = cursor.offset;
let mut logical_pos_x = cursor.logical_pos.x;
let mut logical_pos_y = cursor.logical_pos.y;
let mut visual_pos_x = cursor.visual_pos.x;
let mut visual_pos_y = cursor.visual_pos.y;
let mut column = cursor.column;
let mut offset = self.cursor.offset;
let mut logical_pos_x = self.cursor.logical_pos.x;
let mut logical_pos_y = self.cursor.logical_pos.y;
let mut visual_pos_x = self.cursor.visual_pos.x;
let mut visual_pos_y = self.cursor.visual_pos.y;
let mut column = self.cursor.column;
let mut logical_target_x = Self::calc_target_x(logical_target, logical_pos_y);
let mut visual_target_x = Self::calc_target_x(visual_target, visual_pos_y);
@ -177,7 +166,7 @@ impl<'doc> MeasurementConfig<'doc> {
// wrap_opp = Wrap Opportunity
// These store the position and column of the last wrap opportunity. If `word_wrap_column` is
// zero (word wrap disabled), all grapheme clusters are a wrap opportunity, because none are.
let mut wrap_opp = cursor.wrap_opp;
let mut wrap_opp = self.cursor.wrap_opp;
let mut wrap_opp_offset = offset;
let mut wrap_opp_logical_pos_x = logical_pos_x;
let mut wrap_opp_visual_pos_x = visual_pos_x;
@ -209,7 +198,7 @@ impl<'doc> MeasurementConfig<'doc> {
loop {
if !chunk_iter.has_next() {
cold_path();
chunk_iter = Utf8Chars::new(buffer.read_forward(chunk_range.end), 0);
chunk_iter = Utf8Chars::new(self.buffer.read_forward(chunk_range.end), 0);
chunk_range = chunk_range.end..chunk_range.end + chunk_iter.len();
}
@ -219,7 +208,8 @@ impl<'doc> MeasurementConfig<'doc> {
// Similar applies to the width.
props_last_char = props_next_cluster;
offset_next_cluster = chunk_range.start + chunk_iter.offset();
width += ucd_grapheme_cluster_character_width(props_next_cluster) as CoordType;
width += ucd_grapheme_cluster_character_width(props_next_cluster, ambiguous_width())
as CoordType;
// The `Document::read_forward` interface promises us that it will not split
// grapheme clusters across chunks. Therefore, we can safely break here.
@ -252,10 +242,10 @@ impl<'doc> MeasurementConfig<'doc> {
// Tabs require special handling because they can have a variable width.
if props_last_char == ucd_tab_properties() {
// SAFETY: `tab_size` is clamped to >= 1 in `with_tab_size`.
// SAFETY: `self.tab_size` is clamped to >= 1 in `with_tab_size`.
// This assert ensures that Rust doesn't insert a panicking division-by-zero check.
unsafe { std::hint::assert_unchecked(tab_size >= 1) };
width = tab_size - (column % tab_size);
unsafe { std::hint::assert_unchecked(self.tab_size >= 1) };
width = self.tab_size - (column % self.tab_size);
}
// Hard wrap: Both the logical and visual position advance by one line.
@ -290,7 +280,7 @@ impl<'doc> MeasurementConfig<'doc> {
// Since this code above may need to revert to a previous `wrap_opp_*`,
// it must be done before advancing / checking for `ucd_line_break_joins`.
if word_wrap_column > 0 && visual_pos_x + width > word_wrap_column {
if self.word_wrap_column > 0 && visual_pos_x + width > self.word_wrap_column {
if !wrap_opp {
// Otherwise, the lack of a wrap opportunity means that a single word
// is wider than the word wrap column. We need to force-break the word.
@ -342,7 +332,7 @@ impl<'doc> MeasurementConfig<'doc> {
visual_pos_x += width;
column += width;
if word_wrap_column > 0
if self.word_wrap_column > 0
&& !ucd_line_break_joins(props_current_cluster, props_next_cluster)
{
wrap_opp = true;
@ -355,7 +345,7 @@ impl<'doc> MeasurementConfig<'doc> {
// If we're here, we hit our target. Now the only question is:
// Is the word we're currently on so wide that it will be wrapped further down the document?
if word_wrap_column > 0 {
if self.word_wrap_column > 0 {
if !wrap_opp {
// If the current laid-out line had no wrap opportunities, it means we had an input
// such as "fooooooooooooooooooooo" at a `word_wrap_column` of e.g. 10. The word
@ -386,7 +376,8 @@ impl<'doc> MeasurementConfig<'doc> {
loop {
if !chunk_iter.has_next() {
cold_path();
chunk_iter = Utf8Chars::new(buffer.read_forward(chunk_range.end), 0);
chunk_iter =
Utf8Chars::new(self.buffer.read_forward(chunk_range.end), 0);
chunk_range = chunk_range.end..chunk_range.end + chunk_iter.len();
}
@ -396,8 +387,10 @@ impl<'doc> MeasurementConfig<'doc> {
// Similar applies to the width.
props_last_char = props_next_cluster;
offset_next_cluster = chunk_range.start + chunk_iter.offset();
width +=
ucd_grapheme_cluster_character_width(props_next_cluster) as CoordType;
width += ucd_grapheme_cluster_character_width(
props_next_cluster,
ambiguous_width(),
) as CoordType;
// The `Document::read_forward` interface promises us that it will not split
// grapheme clusters across chunks. Therefore, we can safely break here.
@ -431,10 +424,10 @@ impl<'doc> MeasurementConfig<'doc> {
// Tabs require special handling because they can have a variable width.
if props_last_char == ucd_tab_properties() {
// SAFETY: `tab_size` is clamped to >= 1 in `with_tab_size`.
// SAFETY: `self.tab_size` is clamped to >= 1 in `with_tab_size`.
// This assert ensures that Rust doesn't insert a panicking division-by-zero check.
unsafe { std::hint::assert_unchecked(tab_size >= 1) };
width = tab_size - (column % tab_size);
unsafe { std::hint::assert_unchecked(self.tab_size >= 1) };
width = self.tab_size - (column % self.tab_size);
}
// Hard wrap: Both the logical and visual position advance by one line.
@ -444,7 +437,7 @@ impl<'doc> MeasurementConfig<'doc> {
visual_pos_x_lookahead += width;
if visual_pos_x_lookahead > word_wrap_column {
if visual_pos_x_lookahead > self.word_wrap_column {
visual_pos_x -= wrap_opp_visual_pos_x;
visual_pos_y += 1;
break;
@ -467,13 +460,12 @@ impl<'doc> MeasurementConfig<'doc> {
}
}
Cursor {
offset,
logical_pos: Point { x: logical_pos_x, y: logical_pos_y },
visual_pos: Point { x: visual_pos_x, y: visual_pos_y },
column,
wrap_opp,
}
self.cursor.offset = offset;
self.cursor.logical_pos = Point { x: logical_pos_x, y: logical_pos_y };
self.cursor.visual_pos = Point { x: visual_pos_x, y: visual_pos_y };
self.cursor.column = column;
self.cursor.wrap_opp = wrap_opp;
self.cursor
}
#[inline]

File diff suppressed because it is too large Load diff

View file

@ -247,12 +247,15 @@ Usage: grapheme-table-gen [options...] <ucd.nounihan.grouped.xml>
Expose tab and linefeed as grapheme cluster properties
--no-ambiguous Treat all ambiguous characters as narrow
--line-breaks Store and expose line break information
Download ucd.nounihan.grouped.xml at:
https://www.unicode.org/Public/UCD/latest/ucdxml/ucd.nounihan.grouped.zip
";
fn main() -> anyhow::Result<()> {
let mut args = pico_args::Arguments::from_env();
if args.contains(["-h", "--help"]) {
eprint!("{}", HELP);
eprint!("{HELP}");
return Ok(());
}
@ -310,7 +313,7 @@ fn main() -> anyhow::Result<()> {
for s in &out.trie.stages {
actual = s.values[actual as usize + ((cp >> s.shift) & s.mask)];
}
assert_eq!(expected.value(), actual, "trie sanity check failed for U+{:04X}", cp);
assert_eq!(expected.value(), actual, "trie sanity check failed for U+{cp:04X}");
}
for (cp, &expected) in out.ucd.values[..0x80].iter().enumerate() {
let last = out.trie.stages.last().unwrap();
@ -318,8 +321,7 @@ fn main() -> anyhow::Result<()> {
assert_eq!(
expected.value(),
actual,
"trie sanity check failed for direct ASCII mapping of U+{:04X}",
cp
"trie sanity check failed for direct ASCII mapping of U+{cp:04X}"
);
}
@ -372,7 +374,7 @@ fn generate_c(out: Output) -> String {
for table in &out.rules_gc {
buf.push_str(" {\n");
for &r in table {
_ = writeln!(buf, " 0b{:032b},", r);
_ = writeln!(buf, " 0b{r:032b},");
}
buf.push_str(" },\n");
}
@ -443,15 +445,38 @@ fn generate_c(out: Output) -> String {
{{
return state == 3;
}}
inline int ucd_grapheme_cluster_character_width(const int val)
{{
return val >> {1};
}}
",
out.ucd.packing.mask_cluster_break,
out.ucd.packing.shift_character_width,
);
if out.arg_no_ambiguous {
_ = writedoc!(
buf,
"
inline int ucd_grapheme_cluster_character_width(const int val)
{{
return val >> {};
}}
",
out.ucd.packing.shift_character_width,
);
} else {
_ = writedoc!(
buf,
"
inline int ucd_grapheme_cluster_character_width(const int val, int ambiguous_width)
{{
int w = val >> {};
if (w == 3) {{
w = ambiguous_width;
}}
return w;
}}
",
out.ucd.packing.shift_character_width,
);
}
if out.arg_line_breaks {
_ = writedoc!(
buf,
@ -546,7 +571,7 @@ fn generate_rust(out: Output) -> String {
for table in &out.rules_gc {
buf.push_str(" [\n");
for &r in table {
_ = writeln!(buf, " 0b{:032b},", r);
_ = writeln!(buf, " 0b{r:032b},");
}
buf.push_str(" ],\n");
}
@ -622,15 +647,43 @@ fn generate_rust(out: Output) -> String {
pub fn ucd_grapheme_cluster_joins_done(state: u32) -> bool {{
state == 3
}}
#[inline(always)]
pub fn ucd_grapheme_cluster_character_width(val: usize) -> usize {{
val >> {1}
}}
",
out.ucd.packing.mask_cluster_break,
out.ucd.packing.shift_character_width,
);
if out.arg_no_ambiguous {
_ = writedoc!(
buf,
"
#[inline(always)]
pub fn ucd_grapheme_cluster_character_width(val: usize) -> usize {{
val >> {}
}}
",
out.ucd.packing.shift_character_width,
);
} else {
// `cold_path()` ensures that LLVM emits a branch instead of a conditional move.
// This improves performance, as ambiguous characters are rare.
// `> 2` is used instead of `== 3`, because this way the compiler can immediately
// test whether `val >= (3 << shift_character_width)` before shifting.
_ = writedoc!(
buf,
"
#[inline(always)]
pub fn ucd_grapheme_cluster_character_width(val: usize, ambiguous_width: usize) -> usize {{
let mut w = val >> {};
if w > 2 {{
cold_path();
w = ambiguous_width;
}}
w
}}
",
out.ucd.packing.shift_character_width,
);
}
if out.arg_line_breaks {
_ = writedoc!(
buf,
@ -681,6 +734,17 @@ fn generate_rust(out: Output) -> String {
);
}
if !out.arg_no_ambiguous {
_ = writedoc!(
buf,
"
#[cold]
#[inline(always)]
fn cold_path() {{}}
"
);
}
buf.push_str("// END: Generated by grapheme-table-gen\n");
buf
}