Initial import

This commit is contained in:
Leonard Hecker 2025-03-19 03:13:50 +01:00
commit f654981a07
37 changed files with 15264 additions and 0 deletions

48
.github/workflows/build.yml vendored Normal file
View file

@ -0,0 +1,48 @@
# Builds (and, on x64, tests) the editor on Windows for every push to main,
# then uploads the release binary and its debug symbols as an artifact.
name: build

on:
  push:
    branches:
      - main

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: windows-2022
    strategy:
      matrix:
        toolchain:
          - nightly
        arch:
          - x64
          - arm64
    steps:
      # The Windows runners have autocrlf enabled by default.
      - name: Disable git autocrlf
        run: git config --global core.autocrlf false
      - name: Checkout
        uses: actions/checkout@v4
      # Install whichever toolchain the matrix selects, so that adding an
      # entry under `matrix.toolchain` actually changes what gets built.
      - name: Install ${{ matrix.toolchain }}
        run: |
          rustup toolchain install --no-self-update --profile minimal --component rust-src -- ${{ matrix.toolchain }}
          rustup default ${{ matrix.toolchain }}
          rustup target add ${{ matrix.arch == 'arm64' && 'aarch64-pc-windows-msvc' || 'x86_64-pc-windows-msvc' }}
      # Tests only run for the x64 leg of the matrix.
      - name: Test
        if: matrix.arch == 'x64'
        run: cargo test
      - name: Build
        run: |
          if ("${{ matrix.arch }}" -eq "arm64") {
            .\tools\build_release_windows.bat --target aarch64-pc-windows-msvc
          } else {
            .\tools\build_release_windows.bat
          }
      - name: Upload
        uses: actions/upload-artifact@v4
        with:
          name: Windows ${{ matrix.arch }}
          path: |
            ${{ github.workspace }}/target/${{ matrix.arch == 'arm64' && 'aarch64-pc-windows-msvc/release' || 'release' }}/edit.exe
            ${{ github.workspace }}/target/${{ matrix.arch == 'arm64' && 'aarch64-pc-windows-msvc/release' || 'release' }}/edit.pdb

8
.gitignore vendored Normal file
View file

@ -0,0 +1,8 @@
.idea
.vs
*.user
bin
CMakeSettings.json
obj
out
target

17
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,17 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Launch Debug",
            "preLaunchTask": "rust: cargo build",
            "type": "cppvsdbg",
            "request": "launch",
            "console": "externalTerminal",
            "program": "${workspaceFolder}/target/debug/edit",
            "args": [
                "${workspaceFolder}/README.md"
            ],
            "cwd": "${workspaceFolder}"
        }
    ]
}

24
.vscode/tasks.json vendored Normal file
View file

@ -0,0 +1,24 @@
{
"version": "2.0.0",
"tasks": [
{
"label": "rust: cargo build",
"type": "process",
"command": "cargo",
"args": [
"build",
"--package",
"edit",
"--features",
"debug-latency"
],
"group": {
"kind": "build",
"isDefault": true
},
"problemMatcher": [
"$rustc"
]
}
]
}

90
Cargo.lock generated Normal file
View file

@ -0,0 +1,90 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "edit"
version = "0.1.0"
dependencies = [
"libc",
"windows-sys",
]
[[package]]
name = "libc"
version = "0.2.171"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

34
Cargo.toml Normal file
View file

@ -0,0 +1,34 @@
# Manifest for the `edit` crate.
[package]
name = "edit"
version = "0.1.0"
edition = "2024"
# Opt-in debugging switches. Both are plain feature flags with no dependencies of their own.
[features]
debug-layout = []
debug-latency = []
# Release builds trade compile time for optimization (single codegen unit + LTO)
# while still emitting full debug information.
[profile.release]
codegen-units = 1
debug = "full"
lto = true
# Panics abort the process instead of unwinding.
panic = "abort"
debug-assertions = true # Temporary while I test this
# No platform-independent dependencies; everything below is selected per target.
[dependencies]
[target.'cfg(unix)'.dependencies]
libc = "0.2"
[target.'cfg(windows)'.dependencies.windows-sys]
version = "0.59"
# Only the Win32 API groups listed here are compiled in.
features = [
"Win32_Globalization",
"Win32_Security",
"Win32_Storage_FileSystem",
"Win32_System_Console",
"Win32_System_Diagnostics_Debug",
"Win32_System_IO",
"Win32_System_LibraryLoader",
"Win32_System_Memory",
"Win32_System_Threading",
]

21
LICENSE Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

3
README.md Normal file
View file

@ -0,0 +1,3 @@
# MS-DOS Editor Redux
TBA

View file

@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Source: https://commons.wikimedia.org/wiki/File:Microsoft_logo_(1980).svg -->
<!-- License: Public domain -->
<svg xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://creativecommons.org/ns#" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:svg="http://www.w3.org/2000/svg" xmlns="http://www.w3.org/2000/svg" id="svg8" version="1.1" viewBox="0 0 264.58333 52.916669" height="200" width="1000">
<defs id="defs2"/>
<metadata id="metadata5">
<rdf:RDF>
<cc:Work rdf:about="">
<dc:format>image/svg+xml</dc:format>
<dc:type rdf:resource="http://purl.org/dc/dcmitype/StillImage"/>
<dc:title/>
</cc:Work>
</rdf:RDF>
</metadata>
<g id="layer2">
<path style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" d="M 0,52.916667 33.602084,20.902084 V 34.925001 L 48.418751,20.902084 v 13.758334 h 8.73125 V 0.26458334 L 42.333334,15.08125 V 0.26458334 L 0,42.597917 Z" id="path847"/>
<path style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" d="M 67.468752,0.26458334 58.737501,9.2604169 V 34.660418 h 8.731251 z" id="path849"/>
<path transform="scale(0.26458334)" d="m 301.16016,1 c -21.9507,4.4255933 -39.58425,23.383151 -45.24024,48 H 255 V 53.673828 78.277344 82 h 0.69727 c 5.39479,25.07886 23.17116,44.48439 45.38085,49 H 343 v -30 h -20 v -0.004 C 322.83335,100.999 322.66667,101 322.5,101 303.44618,101 288,85.553824 288,66.5 288,47.446176 303.44618,32 322.5,32 H 342 L 372,1 Z" style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.567005;stop-color:#000000" id="path848"/>
<path transform="scale(0.26458334)" d="m 383,1 -33,34 v 96 h 33 V 33 h 18.5 c 9.66498,0 17.5,7.835017 17.5,17.5 0,9.664983 -7.83502,17.5 -17.5,17.5 H 387 L 521,199 V 157 L 487,123 443.33594,78.365234 A 47.000001,50 0 0 0 451,51 47.000001,50 0 0 0 405.00977,1.0117188 L 405,1 Z" style="display:inline;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.999999px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" id="path864"/>
<path transform="scale(0.26458334)" d="M 525.86523,1 A 68,66.499996 0 0 0 458,67.5 68,66.499996 0 0 0 526,134 68,66.499996 0 0 0 594,67.5 68,66.499996 0 0 0 526,1 68,66.499996 0 0 0 525.86523,1 Z m -1.60546,31 A 36.499998,36.000002 0 0 1 524.5,32 36.499998,36.000002 0 0 1 561,68 36.499998,36.000002 0 0 1 524.5,104 36.499998,36.000002 0 0 1 488,68 36.499998,36.000002 0 0 1 524.25977,32 Z" style="display:inline;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.567001;stop-color:#000000" id="path881"/>
<path transform="scale(0.26458334)" d="m 620.5,1 c -22.36753,-5.5e-7 -40.5,18.132467 -40.5,40.5 0,22.367533 18.13247,40.500001 40.5,40.5 h 2.5 c 11.59798,0 21,4.477153 21,10 0,5.522847 -9.40202,10 -21,10 h -40 v 29 h 62.99999 c 18.43887,-4.06734 31.56367,-20.13433 31.56446,-38.605479 C 677.56392,78.310576 669.87456,65.292316 657.38281,58.226562 640.78385,50.357003 632.38254,48.035667 620.15625,48 615.01343,46.201489 612.00162,43.42696 612,40.486328 611.9995,36.896878 616.47115,33.613784 623.55859,32 H 677 C 686.99999,16.999999 695.99998,9 709,1 h -84 z" style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.567001;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000;stop-opacity:1" id="rect942"/>
<path transform="scale(0.26458334)" d="M 743,0 A 68.999999,68.500001 0 0 0 674,68.5 68.999999,68.500001 0 0 0 743,137 68.999999,68.500001 0 0 0 812,68.5 68.999999,68.500001 0 0 0 743,0 Z m 0.5,32 A 37.499999,36.500002 0 0 1 781,68.5 37.499999,36.500002 0 0 1 743.5,105 37.499999,36.500002 0 0 1 706,68.5 37.499999,36.500002 0 0 1 743.5,32 Z" style="font-variation-settings:normal;opacity:1;vector-effect:none;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.567001;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;stop-color:#000000;stop-opacity:1" id="path881-7"/>
<path id="path1086" d="m 232.03959,22.754167 v -8.73125 h -8.46667 V 8.9958336 h 9.26042 l 8.73125,-8.73125026 H 223.83751 L 214.84167,9.2604169 V 52.652085 l 8.73125,-7.672917 V 22.754167 Z" style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"/>
<path style="fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1" d="m 251.88334,8.9958335 3.70414,-2e-7 8.7313,-8.73125014 h -20.10839 l -8.73122,8.73125034 h 7.67292 V 34.660417 l 8.7312,-7.672917 z" id="path1086-2"/>
</g>
</svg>

BIN
assets/microsoft.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.6 KiB

1
assets/microsoft.sixel Normal file
View file

@ -0,0 +1 @@
P;1q"1;1;300;60#0;2;100;100;100#0!42?_ow{}!12?_ow{}!6?_ow{}}!5?_ow{{}}}!17~^NFbpw{}!8~!4}{wwo_!12?_oow{{{!4}!6~!4}{{wwo__!4?_ow{{}}}!23~^Nfrxw{{}}}!9~!4}{{woo_!12?_ow{}!15~^NFbpw{}!17~^NFB@-!36?_ow{}!6~!6?_ow{}!6~??w{}!7~?o{}!10~^^!10NFBpw{}!6~!8N^!9~{_!4?_o{}!8~^^!9N^^!9~{w}!8~^!18NFbx{}!9~^^!8N^^!9~}{o???ow{}!6~!11NFB@GKM!5N!10~!4NFB@-!30?_ow{}!12~_ow{}!12~??!20~FB@!15?!10~!10?r!9~???{!8~NB@!15?@FN!16~!4{!4wooo__!5?_}!8~^FB!16?@F^!8~{o!10~!9o!13?!10~-!24?_ow{}!35~??!19~x!18?!10~?CK[!4{}!9~^B??N!8~x!21?!10~N^^!18~}{o!10~!22?!29~!13?!10~-!18?_ow{}!8~^NFB@?!11~^NFB@?!10~??!10~F!9~}{wo__!12?!10~!5?@BFN^!9~}{wof^!7~}wo__!11?__o{!9~N@!7?!6@Bb!10~N!9~{o__!12?__o{}!8~F@!10~!9B!13?!10~-!12?_ow{}!8~^NFB@!7?!5~^NFB@!7?!10~??!10~??@FN^!20~??!10~!11?@BFN^!23~!7}!10~^NFB~!12}!12~^NB??BFN^!9~!10}!9~^NF@???!10~!22?!5~^NFB@-!6?_ow{}!8~^NFB@!13?FFB@!13?!10F??!10F!7?@@BB!15F??!10F!17?@BFN^!10~|zrfFF!10NFFFBB@@!5?!21FBB@!11?@BBFFNNN!10^NNNFFBB@!8?!10~!22?NFB@-_ow{}!8~^NFB@!119?@BFN^!9~}{wo!88?!10~-!7~^NFB@!131?@BFN^!7~!88?!7~^NF-~^NFB@!143?@BFN^~!88?~^NFB@\

12
build.rs Normal file
View file

@ -0,0 +1,12 @@
/// Build script entry point.
///
/// When the target is Windows with the MSVC environment, instructs Cargo to
/// embed `src/edit.exe.manifest` into the `edit` binary and to treat linker
/// warnings as errors. For every other target it does nothing.
fn main() {
    let target_os = std::env::var("CARGO_CFG_TARGET_OS");
    let target_env = std::env::var("CARGO_CFG_TARGET_ENV");
    let is_windows_msvc =
        matches!(target_os.as_deref(), Ok("windows")) && matches!(target_env.as_deref(), Ok("msvc"));
    if !is_windows_msvc {
        return;
    }

    // The linker needs an absolute path to the manifest.
    let manifest = std::path::absolute("src/edit.exe.manifest").unwrap();
    let manifest = manifest.to_str().unwrap();

    let directives = [
        String::from("cargo::rerun-if-changed=src/edit.exe.manifest"),
        String::from("cargo::rustc-link-arg-bin=edit=/MANIFEST:EMBED"),
        format!("cargo::rustc-link-arg-bin=edit=/MANIFESTINPUT:{}", manifest),
        String::from("cargo::rustc-link-arg-bin=edit=/WX"),
    ];
    for directive in directives {
        println!("{}", directive);
    }
}

80
src/apperr.rs Normal file
View file

@ -0,0 +1,80 @@
use crate::sys;
use std::num::NonZeroU32;
use std::{fmt, io, result};
// Remember to add an entry to `Error::message()` for each new error.
/// Application error 1; `Error::message()` reports it as "ICU not found".
pub const APP_ICU_MISSING: Error = Error::new_app(1);
/// Application error 2; `Error::message()` reports it as "File not found".
/// Also produced when converting an `io::Error` of kind `NotFound`.
pub const APP_FILE_NOT_FOUND: Error = Error::new_app(2);
/// Shorthand for a `Result` whose error type is this module's `Error`.
pub type Result<T> = result::Result<T, Error>;
/// A compact error value: a single non-zero `u32`.
///
/// Errors created via `new_app`/`new_icu` carry a tag in the upper 16 bits and
/// a 16-bit code in the lower 16 bits. Other values are passed through as-is
/// and formatted by `sys::format_error`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Error(NonZeroU32);
impl Error {
const FLAGS_MASK: u32 = 0xF8000000; // Top 5 bits
const FLAGS_CUSTOM_FAILURE: u32 = 0xA0000000;
const TAG_APP: u32 = Self::FLAGS_CUSTOM_FAILURE | (1 << 16);
const TAG_ICU: u32 = Self::FLAGS_CUSTOM_FAILURE | (2 << 16);
pub const unsafe fn new(code: u32) -> Self {
Error(unsafe { NonZeroU32::new_unchecked(code) })
}
pub const fn new_app(code: u32) -> Self {
debug_assert!(code > 0 && code <= 0xFFFF);
unsafe { Self::new(Self::TAG_APP | code) }
}
pub const fn new_icu(code: u32) -> Self {
debug_assert!(code > 0 && code <= 0xFFFF);
unsafe { Self::new(Self::TAG_ICU | code) }
}
pub fn is_app(&self) -> bool {
(self.0.get() & 0xFFFF0000) == Self::TAG_APP
}
pub fn is_icu(&self) -> bool {
(self.0.get() & 0xFFFF0000) == Self::TAG_ICU
}
pub fn code(&self) -> u32 {
self.0.get() & 0xFFFF
}
pub fn value(&self) -> u32 {
self.0.get()
}
pub fn message(self) -> String {
match self {
APP_ICU_MISSING => "ICU not found".to_string(),
APP_FILE_NOT_FOUND => "File not found".to_string(),
_ => {
debug_assert!(!self.is_app());
if self.is_icu() {
format!("ICU Error {:#08x}", self.code())
} else {
sys::format_error(self)
}
}
}
}
}
impl fmt::Display for Error {
    /// Writes the raw error value as `0x`-prefixed lowercase hex, zero-padded
    /// to a total width of 8 characters (prefix included).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let raw = self.0;
        f.write_fmt(format_args!("{:#08x}", raw))
    }
}
impl From<io::Error> for Error {
    /// Converts an I/O error: `NotFound` becomes `APP_FILE_NOT_FOUND`, every
    /// other kind is translated by `sys::io_error_to_apperr`.
    fn from(err: io::Error) -> Self {
        if err.kind() == io::ErrorKind::NotFound {
            APP_FILE_NOT_FOUND
        } else {
            sys::io_error_to_apperr(err)
        }
    }
}

2299
src/buffer.rs Normal file

File diff suppressed because it is too large Load diff

22
src/edit.exe.manifest Normal file
View file

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<assembly
xmlns="urn:schemas-microsoft-com:asm.v1"
xmlns:asmv3="urn:schemas-microsoft-com:asm.v3"
xmlns:cv1="urn:schemas-microsoft-com:compatibility.v1"
xmlns:ws2="http://schemas.microsoft.com/SMI/2016/WindowsSettings"
xmlns:ws3="http://schemas.microsoft.com/SMI/2019/WindowsSettings"
xmlns:ws4="http://schemas.microsoft.com/SMI/2020/WindowsSettings"
manifestVersion="1.0">
<asmv3:application>
<windowsSettings>
<ws2:longPathAware>true</ws2:longPathAware>
<ws3:activeCodePage>UTF-8</ws3:activeCodePage>
<ws4:heapType>SegmentHeap</ws4:heapType>
</windowsSettings>
</asmv3:application>
<cv1:compatibility>
<application>
<supportedOS Id="{8e0f7a12-bfb3-4fe8-b9a5-48fd50a15a9a}"/>
</application>
</cv1:compatibility>
</assembly>

536
src/framebuffer.rs Normal file
View file

@ -0,0 +1,536 @@
use crate::helpers::{CoordType, Point, Rect, Size};
use crate::{helpers, ucd};
use std::fmt::Write;
/// Names for the slots of a color table.
///
/// Variants are converted with `as usize` to index `[u32; INDEXED_COLORS_COUNT]`
/// arrays, so their declaration order must match the order of the table entries
/// (see `DEFAULT_THEME`).
pub enum IndexedColor {
Black,
Red,
Green,
Yellow,
Blue,
Magenta,
Cyan,
White,
BrightBlack,
BrightRed,
BrightGreen,
BrightYellow,
BrightBlue,
BrightMagenta,
BrightCyan,
BrightWhite,
DefaultBackground,
DefaultForeground,
}
/// Number of entries in a color table; equals the number of `IndexedColor` variants.
pub const INDEXED_COLORS_COUNT: usize = 18;
/// Color table a new `Framebuffer` starts out with, in `IndexedColor` order.
///
/// Each entry is packed as `0xAABBGGRR`: the lowest byte is red and the highest
/// byte is alpha, which is how `Framebuffer::render` and `Framebuffer::mix` decode it.
pub const DEFAULT_THEME: [u32; INDEXED_COLORS_COUNT] = [
0xff000000, 0xff212cbe, 0xff3aae3f, 0xff4a9abe, 0xffbe4d20, 0xffbe54bb, 0xffb2a700, 0xffbebebe,
0xff808080, 0xff303eff, 0xff51ea58, 0xff44c9ff, 0xffff6a2f, 0xffff74fc, 0xfff0e100, 0xffffffff,
0xff000000, 0xffffffff,
];
/// Off-screen picture of the terminal: one text string per row plus a
/// background and a foreground color for every cell.
pub struct Framebuffer {
// Color table, indexed by `IndexedColor as usize`.
indexed_colors: [u32; INDEXED_COLORS_COUNT],
// Current dimensions; `reset` reallocates the buffers when it changes.
size: Size,
// One string per row; `reset` fills each with `width` spaces.
lines: Vec<String>,
// Background color per cell, `width * height` entries addressed as `y * width + x`.
bg_bitmap: Vec<u32>,
// Foreground color per cell, same layout as `bg_bitmap`.
fg_bitmap: Vec<u32>,
// Foreground candidates used by `blend_rect_auto`, which picks index 1 over dark backgrounds.
auto_colors: [u32; 2], // [dark, light]
// Cursor position; `render` hides the cursor if either coordinate is negative.
cursor: Point,
// Selects the cursor style `render` emits (DECSCUSR 1 if true, 5 otherwise).
cursor_overtype: bool,
}
impl Framebuffer {
pub fn new() -> Self {
Self {
indexed_colors: DEFAULT_THEME,
size: Size::default(),
lines: Vec::new(),
bg_bitmap: Vec::new(),
fg_bitmap: Vec::new(),
auto_colors: [0, 0],
cursor: Point { x: -1, y: -1 },
cursor_overtype: false,
}
}
/// Replaces the color table and re-derives the automatic foreground colors.
///
/// The pair is taken from the `Black` and `BrightWhite` slots and stored as
/// `[dark, light]`; if `Black` is not actually dark the two are swapped.
pub fn set_indexed_colors(&mut self, colors: [u32; INDEXED_COLORS_COUNT]) {
    let black = colors[IndexedColor::Black as usize];
    let bright_white = colors[IndexedColor::BrightWhite as usize];

    self.indexed_colors = colors;
    self.auto_colors = if Self::quick_is_dark(black) {
        [black, bright_white]
    } else {
        [bright_white, black]
    };
}
/// Clears the framebuffer for a new frame of the given size.
///
/// Buffers are reallocated only if `size` differs from the current size.
/// Afterwards every cell has the default background color, a foreground of 0,
/// every line consists of `width` spaces and the cursor is hidden.
pub fn reset(&mut self, size: Size) {
    let width = size.width as usize;

    if self.size != size {
        let height = size.height as usize;
        let cell_count = width * height;
        self.size = size;
        self.lines = vec![String::new(); height];
        self.bg_bitmap = vec![0; cell_count];
        self.fg_bitmap = vec![0; cell_count];
    }

    let default_bg = self.indexed_colors[IndexedColor::DefaultBackground as usize];
    self.bg_bitmap.fill(default_bg);
    self.fg_bitmap.fill(0);
    self.cursor = Point { x: -1, y: -1 };

    for line in self.lines.iter_mut() {
        line.clear();
        // Leave some headroom for multi-byte characters.
        line.reserve(width + width / 2);
        helpers::string_append_repeat(line, ' ', width);
    }
}
/// Replaces text contents in a single line of the framebuffer.
/// All coordinates are in viewport coordinates.
/// Assumes that all tabs have been replaced with spaces.
///
/// # Arguments
///
/// * `y` - The y-coordinate of the line to replace.
/// * `origin_x` - The x-coordinate where the text should be inserted.
/// * `clip_right` - The x-coordinate past which the text will be clipped.
/// * `text` - The text to insert.
///
/// # Returns
///
/// The rectangle that was updated. An empty (default) rectangle is returned
/// if `y` is not a valid row, `text` is empty, or nothing fits.
pub fn replace_text(
&mut self,
y: CoordType,
origin_x: CoordType,
clip_right: CoordType,
text: &str,
) -> Rect {
// Rows that don't exist are silently ignored.
let Some(line) = self.lines.get_mut(y as usize) else {
return Rect::default();
};
let bytes = text.as_bytes();
let clip_right = clip_right.clamp(0, self.size.width);
let layout_width = clip_right - origin_x;
// Can't insert text that can't fit or is empty.
if layout_width <= 0 || bytes.is_empty() {
return Rect::default();
}
let mut cfg = ucd::MeasurementConfig::new(&bytes);
// Check if the text intersects with the left edge of the framebuffer
// and figure out the parts that are inside.
let mut left = origin_x;
if left < 0 {
let cursor = cfg.goto_visual(Point { x: -left, y: 0 });
left += cursor.visual_pos.x;
if left < 0 && cursor.offset < text.len() {
// `-left` must've intersected a wide glyph. Go to the next one.
let cursor = cfg.goto_logical(Point {
x: cursor.logical_pos.x + 1,
y: 0,
});
left += cursor.visual_pos.x;
}
}
// If the text still starts outside the framebuffer, we must've run out of text above.
// Otherwise, if it starts outside the right edge to begin with, we can't insert it anyway.
if left < 0 || left >= clip_right {
return Rect::default();
}
// Measure the width of the new text (= `res_new.visual_pos.x`).
let res_new = cfg.goto_visual(Point {
x: layout_width,
y: 0,
});
// Figure out at which byte offset the new text gets inserted.
// NOTE(review): if `visual_pos` is measured from the start of `text` (as the
// `-left` lookup above suggests), then for `origin_x < 0` this `right` counts
// the clipped-off columns twice — confirm against `ucd::MeasurementConfig`.
let right = left + res_new.visual_pos.x;
let line_bytes = line.as_bytes();
let mut cfg_old = ucd::MeasurementConfig::new(&line_bytes);
let res_old_beg = cfg_old.goto_visual(Point { x: left, y: 0 });
let res_old_end = cfg_old.goto_visual(Point { x: right, y: 0 });
// If we intersect a wide glyph, we need to pad the new text with spaces.
// NOTE(review): the slice below starts at byte 0 even when the left part of
// `text` was skipped above; presumably it should start at the offset `cfg`
// had reached before measuring — TODO confirm.
let mut str_new = &text[..res_new.offset];
let mut str_buf = String::new();
let overlap_beg = res_old_beg.visual_pos.x - left;
let overlap_end = right - res_old_end.visual_pos.x;
if overlap_beg > 0 || overlap_end > 0 {
if overlap_beg > 0 {
helpers::string_append_repeat(&mut str_buf, ' ', overlap_beg as usize);
}
str_buf.push_str(str_new);
if overlap_end > 0 {
helpers::string_append_repeat(&mut str_buf, ' ', overlap_end as usize);
}
str_new = &str_buf;
}
// Swap the old byte range of the line for the (possibly padded) new text.
(*line).replace_range(res_old_beg.offset..res_old_end.offset, str_new);
Rect {
left,
top: y,
right,
bottom: y + 1,
}
}
/// Draws a vertical scrollbar into `track`, clipped to `clip_rect`.
///
/// # Arguments
///
/// * `clip_rect` - The area drawing is restricted to.
/// * `track` - The area occupied by the scrollbar; its height is the viewport height.
/// * `content_offset` - The scroll offset of the content, in rows.
/// * `content_height` - The total height of the content, in rows.
///
/// The thumb is positioned with 1/8th-row precision by using the block
/// element glyphs U+2581..=U+2588 for its partially covered first and last row.
pub fn draw_scrollbar(
    &mut self,
    clip_rect: Rect,
    track: Rect,
    content_offset: CoordType,
    content_height: CoordType,
) {
    if track.is_empty() {
        return;
    }

    let viewport_height = track.height();
    // The content height is at least the viewport height.
    let content_height = content_height.max(viewport_height);
    // The content offset must be at least one viewport height from the bottom.
    // You don't want to scroll past the end after all...
    let content_offset = content_offset.clamp(0, content_height - viewport_height);

    // In order to increase the visual resolution of the scrollbar,
    // we'll use 1/8th blocks to represent the thumb.
    // First, scale the offsets to get that 1/8th resolution.
    let viewport_height = viewport_height as i64 * 8;
    let content_offset = content_offset as i64 * 8;
    let content_height = content_height as i64 * 8;

    // The proportional thumb height (0-1) is the fraction of viewport and
    // content height. The taller the content, the smaller the thumb:
    // = viewport_height / content_height
    //
    // We then scale that to the viewport height to get the height in 1/8th units.
    // = viewport_height * viewport_height / content_height
    //
    // We add content_height/2 to round the integer division, which results in a numerator of:
    // = viewport_height * viewport_height + content_height / 2
    //
    // Finally we add +1 to round up the division if `content_height` is uneven. This ensures that
    // in case of a rounding issue, we'll make the thumb too large and clamp it to the track size.
    let thumb_numerator = viewport_height * viewport_height + content_height / 2 + 1;
    let thumb_height = thumb_numerator / content_height;
    // Ensure the thumb has a minimum size of 1 row.
    let thumb_height = thumb_height.max(8);

    // The proportional thumb top position (0-1) is naturally:
    // = content_offset / content_height
    //
    // The bottom position is 1 viewport-height below the top position:
    // = (viewport_height + content_offset) / content_height
    //
    // Since everything must be scaled to the 1/8th units we must multiply by viewport_height:
    // = viewport_height * (viewport_height + content_offset) / content_height
    // = (viewport_height * viewport_height + viewport_height * content_offset) / content_height
    //
    // And we also want that rounded integer division as before. This transforms the
    // `viewport_height * viewport_height` portion into the `thumb_numerator` above.
    // = (thumb_numerator + viewport_height * content_offset) / content_height
    //
    let thumb_bottom = (viewport_height * content_offset + thumb_numerator) / content_height;
    // Now that we know the bottom of the thumb, we can calculate the top.
    let thumb_top = (thumb_bottom - thumb_height).max(0);

    // Calculate the height of the top/bottom cell of the thumb.
    let top_fract = (thumb_top % 8) as CoordType;
    let bottom_fract = (thumb_bottom % 8) as CoordType;

    // Shift to absolute coordinates. The top is rounded up and the bottom down,
    // so that `thumb_top..thumb_bottom` covers only the fully filled rows.
    let thumb_top = ((thumb_top + 7) / 8) as CoordType + track.top;
    let thumb_bottom = (thumb_bottom / 8) as CoordType + track.top;

    let track_clipped = track.intersect(clip_rect);

    // Clamp to the visible area.
    let thumb_top_clipped = thumb_top.max(track_clipped.top);
    let thumb_bottom_clipped = thumb_bottom.min(track_clipped.bottom);

    self.blend_bg(track_clipped, self.indexed(IndexedColor::BrightBlack));
    self.blend_fg(track_clipped, self.indexed(IndexedColor::BrightWhite));

    // Draw the full blocks.
    // This must be an actual glyph: `replace_text` ignores empty text, and the
    // track itself only consists of spaces, so the thumb would otherwise be invisible.
    for y in thumb_top_clipped..thumb_bottom_clipped {
        self.replace_text(y, track_clipped.left, track_clipped.right, "█");
    }

    // Draw the top/bottom cell of the thumb.
    // U+2581 to U+2588, 1/8th block to 8/8th block elements glyphs: ▁▂▃▄▅▆▇█
    // In UTF8: E2 96 81 to E2 96 88
    let mut fract_buf = [0xE2, 0x96, 0x88];
    if top_fract != 0 {
        // The thumb covers the lower `8 - top_fract` eighths of this cell.
        fract_buf[2] = (0x88 - top_fract) as u8;
        self.replace_text(
            thumb_top_clipped - 1,
            track_clipped.left,
            track_clipped.right,
            unsafe { std::str::from_utf8_unchecked(&fract_buf) },
        );
    }
    if bottom_fract != 0 {
        // The thumb covers the upper `bottom_fract` eighths of this cell. There are
        // no "upper block" glyphs in that range, so draw the complementary lower
        // block and swap the foreground and background colors instead.
        fract_buf[2] = (0x88 - bottom_fract) as u8;
        let rect = self.replace_text(
            thumb_bottom_clipped,
            track_clipped.left,
            track_clipped.right,
            unsafe { std::str::from_utf8_unchecked(&fract_buf) },
        );
        self.blend_bg(rect, self.indexed(IndexedColor::BrightWhite));
        self.blend_fg(rect, self.indexed(IndexedColor::BrightBlack));
    }
}
/// Looks up the color currently assigned to the given palette slot.
#[inline]
pub fn indexed(&self, index: IndexedColor) -> u32 {
    let slot = index as usize;
    self.indexed_colors[slot]
}
/// Alpha-blends the background color `bg` over every cell inside `target`.
pub fn blend_bg(&mut self, target: Rect, bg: u32) {
    let size = self.size;
    Self::alpha_blend_rect(self.bg_bitmap.as_mut_slice(), target, size, bg);
}
/// Alpha-blends the foreground color `fg` over every cell inside `target`.
///
/// A value of 0 is special: instead of blending, each cell receives one of the
/// two automatic colors depending on how dark its background is.
pub fn blend_fg(&mut self, target: Rect, fg: u32) {
    if fg == 0 {
        self.blend_rect_auto(target);
        return;
    }

    let size = self.size;
    Self::alpha_blend_rect(self.fg_bitmap.as_mut_slice(), target, size, fg);
}
/// Blends the color `src` over the part of `rect` that lies inside a bitmap of
/// the given `size`.
///
/// Fully opaque colors overwrite the destination, fully transparent colors
/// leave it untouched, and everything in between is mixed via `Self::mix`.
fn alpha_blend_rect(dst: &mut [u32], rect: Rect, size: Size, src: u32) {
    let (width, height) = (size.width, size.height);
    let x_beg = rect.left.clamp(0, width);
    let x_end = rect.right.clamp(0, width);
    let y_beg = rect.top.clamp(0, height);
    let y_end = rect.bottom.clamp(0, height);

    if x_beg >= x_end || y_beg >= y_end {
        return;
    }

    match src >> 24 {
        // Fully transparent: nothing to do.
        0x00 => {}
        // Fully opaque: a plain fill is enough.
        0xff => {
            for row in y_beg..y_end {
                let row_beg = (row * width + x_beg) as usize;
                let row_end = (row * width + x_end) as usize;
                dst[row_beg..row_end].fill(src);
            }
        }
        _ => {
            for row in y_beg..y_end {
                let row_end = (row * width + x_end) as usize;
                let mut pos = (row * width + x_beg) as usize;

                while pos < row_end {
                    // Process runs of identical colors at once, so that the
                    // comparatively expensive mix happens only once per run.
                    let old = dst[pos];
                    let run = dst[pos..row_end].iter().take_while(|&&c| c == old).count();
                    let blended = Self::mix(old, src, 1.0, 1.0);
                    dst[pos..pos + run].fill(blended);
                    pos += run;
                }
            }
        }
    }
}
/// Assigns an automatic foreground color to every cell inside `rect`:
/// `auto_colors[1]` where the background is dark and `auto_colors[0]` otherwise.
fn blend_rect_auto(&mut self, rect: Rect) {
    let (width, height) = (self.size.width, self.size.height);
    let x_beg = rect.left.clamp(0, width);
    let x_end = rect.right.clamp(0, width);
    let y_beg = rect.top.clamp(0, height);
    let y_end = rect.bottom.clamp(0, height);

    if x_beg >= x_end || y_beg >= y_end {
        return;
    }

    for row in y_beg..y_end {
        let row_end = (row * width + x_end) as usize;
        let mut pos = (row * width + x_beg) as usize;

        while pos < row_end {
            // Process runs of identical backgrounds at once, so that the
            // darkness check happens only once per run.
            let bg = self.bg_bitmap[pos];
            let run = self.bg_bitmap[pos..row_end]
                .iter()
                .take_while(|&&c| c == bg)
                .count();
            let fg = self.auto_colors[Self::quick_is_dark(bg) as usize];
            self.fg_bitmap[pos..pos + run].fill(fg);
            pos += run;
        }
    }
}
/// Composites `src` over `dst` ("source over") in linear light.
///
/// Both colors are packed as `0xAABBGGRR`. Before compositing, the alpha of
/// `dst` is scaled by `dst_balance` and the alpha of `src` by `src_balance`.
/// Returns 0 if the result is fully transparent.
fn mix(dst: u32, src: u32, dst_balance: f32, src_balance: f32) -> u32 {
    let src_r = Self::srgb_to_linear(src & 0xff);
    let src_g = Self::srgb_to_linear((src >> 8) & 0xff);
    let src_b = Self::srgb_to_linear((src >> 16) & 0xff);
    let src_a = (src >> 24) as f32 / 255.0f32;
    // Each alpha is weighted by its own balance factor.
    let src_a = src_a * src_balance;

    let dst_r = Self::srgb_to_linear(dst & 0xff);
    let dst_g = Self::srgb_to_linear((dst >> 8) & 0xff);
    let dst_b = Self::srgb_to_linear((dst >> 16) & 0xff);
    let dst_a = (dst >> 24) as f32 / 255.0f32;
    let dst_a = dst_a * dst_balance;

    let out_a = src_a + dst_a * (1.0f32 - src_a);
    // Nothing visible remains; also avoids dividing by zero below.
    if out_a <= 0.0f32 {
        return 0;
    }

    let out_r = (src_r * src_a + dst_r * dst_a * (1.0f32 - src_a)) / out_a;
    let out_g = (src_g * src_a + dst_g * dst_a * (1.0f32 - src_a)) / out_a;
    let out_b = (src_b * src_a + dst_b * dst_a * (1.0f32 - src_a)) / out_a;

    // Round to nearest: float error can leave an opaque result a hair below 1.0,
    // which truncation would turn into an alpha of 254.
    let out_a = ((out_a * 255.0f32).round() as u32).min(255);

    (out_a << 24)
        | (Self::linear_to_srgb(out_b) << 16)
        | (Self::linear_to_srgb(out_g) << 8)
        | Self::linear_to_srgb(out_r)
}
/// Converts an 8-bit sRGB channel value (0..=255) to linear light (0.0..=1.0)
/// using the piecewise sRGB transfer function.
fn srgb_to_linear(c: u32) -> f32 {
    let encoded = c as f32 / 255.0f32;
    match encoded {
        // Linear segment near black.
        v if v <= 0.04045f32 => v / 12.92f32,
        // Power-law segment.
        v => ((v + 0.055f32) / 1.055f32).powf(2.4f32),
    }
}
/// Converts a linear-light channel value (nominally 0.0..=1.0) to an 8-bit
/// sRGB value using the piecewise sRGB transfer function.
///
/// The result is rounded to the nearest integer and clamped to 0..=255.
/// Plain truncation would map 1.0 to 254, because the f32 expression
/// `1.055 * 1.0 - 0.055` evaluates to slightly less than 1.0.
fn linear_to_srgb(c: f32) -> u32 {
    let encoded = if c <= 0.0031308f32 {
        // Linear segment near black.
        c * 12.92f32
    } else {
        // Power-law segment.
        1.055f32 * c.powf(1.0f32 / 2.4f32) - 0.055f32
    };
    // The clamp keeps out-of-range inputs from spilling into neighboring
    // channels once the caller shifts the result into place.
    (encoded * 255.0f32).round().clamp(0.0f32, 255.0f32) as u32
}
/// Cheap check whether a color (packed as `0xAABBGGRR`) is dark.
///
/// Uses integer weights of 3, 10 and 1 for red, green and blue as a rough
/// approximation of the sRGB luminance Y = 0.2126 R + 0.7152 G + 0.0722 B,
/// and compares against the midpoint (128) scaled by the sum of the weights.
fn quick_is_dark(c: u32) -> bool {
    // Little-endian byte order yields the channels as [r, g, b, a].
    let [r, g, b, _] = c.to_le_bytes().map(u32::from);
    let weighted = r * 3 + g * 10 + b;
    weighted < 128 * 14
}
/// Stores the cursor position and style that the next `render` call emits.
/// A position with a negative coordinate hides the cursor.
pub fn set_cursor(&mut self, pos: Point, overtype: bool) {
    (self.cursor, self.cursor_overtype) = (pos, overtype);
}
/// Serializes the framebuffer into a string of text and VT escape sequences
/// that paints the entire screen, starting at the home position.
///
/// Color sequences are only emitted where the background or foreground differs
/// from that of the preceding cell. The string ends with the sequences to
/// either place and show the cursor, or to hide it.
pub fn render(&mut self) -> String {
    let mut result = String::new();
    result.push_str("\x1b[H");

    // Use `first()` rather than `[0]`: the bitmaps are empty until `reset` has
    // been called with a non-empty size, and indexing would panic then.
    let mut last_bg = self.bg_bitmap.first().copied().unwrap_or(0);
    let mut last_fg = self.fg_bitmap.first().copied().unwrap_or(0);
    // Invert the colors to force a color change on the first cell.
    last_bg ^= 1;
    last_fg ^= 1;

    for y in 0..self.size.height {
        if y != 0 {
            result.push_str("\r\n");
        }

        let line = &self.lines[y as usize][..];
        let line_bytes = line.as_bytes();
        let mut cfg = ucd::MeasurementConfig::new(&line_bytes);

        for x in 0..self.size.width {
            let bg = self.bg_bitmap[(y * self.size.width + x) as usize];
            let fg = self.fg_bitmap[(y * self.size.width + x) as usize];
            if bg == last_bg && fg == last_fg {
                continue;
            }

            // Flush the text up to this column before switching colors.
            if x != 0 {
                let beg = cfg.cursor().offset;
                let end = cfg.goto_visual(Point { x, y: 0 }).offset;
                result.push_str(&line[beg..end]);
            }

            if last_bg != bg {
                last_bg = bg;
                _ = write!(
                    result,
                    "\x1b[48;2;{};{};{}m",
                    bg & 0xff,
                    (bg >> 8) & 0xff,
                    (bg >> 16) & 0xff
                );
            }

            if last_fg != fg {
                last_fg = fg;
                _ = write!(
                    result,
                    "\x1b[38;2;{};{};{}m",
                    fg & 0xff,
                    (fg >> 8) & 0xff,
                    (fg >> 16) & 0xff
                );
            }
        }

        // Flush the remainder of the line.
        result.push_str(&line[cfg.cursor().offset..]);
    }

    if self.cursor.x >= 0 && self.cursor.y >= 0 {
        // CUP to the cursor position.
        // DECSCUSR to set the cursor style.
        // DECTCEM to show the cursor.
        _ = write!(
            result,
            "\x1b[{};{}H\x1b[{} q\x1b[?25h",
            self.cursor.y + 1,
            self.cursor.x + 1,
            if self.cursor_overtype { 1 } else { 5 }
        );
    } else {
        // DECTCEM to hide the cursor.
        result.push_str("\x1b[?25l");
    }

    result
}
}
/// Mixes two colors via `Framebuffer::mix`, passing `1.0 - balance` as the
/// weight for `dst` and `balance` as the weight for `src`.
pub fn mix(dst: u32, src: u32, balance: f32) -> u32 {
    let src_weight = balance;
    let dst_weight = 1.0 - balance;
    Framebuffer::mix(dst, src, dst_weight, src_weight)
}

234
src/fuzzy.rs Normal file
View file

@ -0,0 +1,234 @@
//! Fuzzy search algorithm based on the one used in VS Code (`/src/vs/base/common/fuzzyScorer.ts`).
//! Other algorithms exist, such as Sublime Text's, or the one used in `fzf`,
//! but I figured that this one is what lots of people may be familiar with.
use crate::icu;
/// Result of a fuzzy match: the score plus a list of indices
/// (presumably the matched positions within the target — confirm against `do_score_fuzzy`).
pub type FuzzyScore = (i32, Vec<usize>);
/// Score value that stands for "the query does not match".
const NO_MATCH: i32 = 0;
/// The `FuzzyScore` returned when there is no match: `NO_MATCH` and no indices.
const NO_SCORE: FuzzyScore = (NO_MATCH, Vec::new());
/// Scores how well `query` fuzzily matches `target`.
///
/// Returns `NO_SCORE` if either string is empty or if `query` has more
/// characters than `target`. Otherwise both strings are case-folded,
/// split into characters and handed to `do_score_fuzzy`.
pub fn score_fuzzy(target: &str, query: &str, allow_non_contiguous_matches: bool) -> FuzzyScore {
    // Nothing can match if there is nothing to search for or search in.
    if target.is_empty() || query.is_empty() {
        return NO_SCORE;
    }

    let folded_target = icu::fold_case(target);
    let folded_query = icu::fold_case(query);

    // NOTE(review): the folded strings are indexed in lockstep with the originals
    // further down; presumably `fold_case` never changes the number of characters — confirm.
    let target_chars: Vec<char> = target.chars().collect();
    let folded_target_chars: Vec<char> = folded_target.chars().collect();
    let query_chars: Vec<char> = query.chars().collect();
    let folded_query_chars: Vec<char> = folded_query.chars().collect();

    // A query longer than the target cannot possibly be contained in it.
    if query_chars.len() > target_chars.len() {
        return NO_SCORE;
    }

    do_score_fuzzy(
        &query_chars,
        &folded_query_chars,
        &target_chars,
        &folded_target_chars,
        allow_non_contiguous_matches,
    )
}
/// Computes the fuzzy score of `query` against `target` with a
/// `query.len() x target.len()` dynamic-programming matrix.
///
/// * `query` / `target`: the original characters (used for the same-case and
///   camel-case bonuses).
/// * `query_lower` / `target_lower`: the case-folded characters (used for the
///   actual comparison). They are indexed with the same indices as `query`
///   and `target`.
///   NOTE(review): this assumes case folding never changes the number of
///   `char`s; a shorter folded slice would index out of bounds — confirm
///   what `icu::fold_case` guarantees.
/// * `allow_non_contiguous_matches`: if `false`, the first query character is
///   only accepted where the entire folded query follows contiguously.
///
/// Returns the score in the bottom-right matrix cell together with the
/// matched target indices in ascending order. Empty input yields a
/// `NO_MATCH` score and no positions.
fn do_score_fuzzy(
    query: &[char],
    query_lower: &[char],
    target: &[char],
    target_lower: &[char],
    allow_non_contiguous_matches: bool,
) -> FuzzyScore {
    // An empty matrix has no bottom-right cell to read the final score from;
    // without this guard the index computation at the end would underflow.
    if query.is_empty() || target.is_empty() {
        return (NO_MATCH, Vec::new());
    }

    let width = target.len();
    let mut scores = vec![0; query.len() * width];
    let mut matches = vec![0; query.len() * width];

    //
    // Build Scorer Matrix:
    //
    // The matrix is composed of query q and target t. For each index we score
    // q[i] with t[i] and compare that with the previous score. If the score is
    // equal or larger, we keep the match. In addition to the score, we also keep
    // the length of the consecutive matches to use as boost for the score.
    //
    //      t   a   r   g   e   t
    // q
    // u
    // e
    // r
    // y
    //
    for query_index in 0..query.len() {
        let row_offset = query_index * width;

        for target_index in 0..width {
            let current_index = row_offset + target_index;

            let left_score = if target_index > 0 {
                scores[current_index - 1]
            } else {
                0
            };

            // The cell one row up and one column to the left, if it exists.
            let (diag_score, matches_sequence_len) = if query_index > 0 && target_index > 0 {
                let diag_index = current_index - width - 1;
                (scores[diag_index], matches[diag_index])
            } else {
                (0, 0)
            };

            // If we are not matching on the first query character any more, we only produce a
            // score if we had a score previously for the last query index (by looking at the diagScore).
            // This makes sure that the query always matches in sequence on the target. For example
            // given a target of "ede" and a query of "de", we would otherwise produce a wrong high score
            // for query[1] ("e") matching on target[0] ("e") because of the "beginning of word" boost.
            let score = if diag_score == 0 && query_index != 0 {
                0
            } else {
                compute_char_score(
                    query[query_index],
                    query_lower[query_index],
                    if target_index != 0 {
                        Some(target[target_index - 1])
                    } else {
                        None
                    },
                    target[target_index],
                    target_lower[target_index],
                    matches_sequence_len,
                )
            };

            // We have a score and its equal or larger than the left score
            // Match: sequence continues growing from previous diag value
            // Score: increases by diag score value
            let is_valid_score = score != 0 && diag_score + score >= left_score;
            if is_valid_score
                && (
                    // We don't need to check if it's contiguous if we allow non-contiguous matches
                    allow_non_contiguous_matches ||
                    // We must be looking for a contiguous match.
                    // Looking at an index higher than 0 in the query means we must have already
                    // found out this is contiguous otherwise there wouldn't have been a score
                    query_index > 0 ||
                    // lastly check if the query is completely contiguous at this index in the target
                    target_lower[target_index..].starts_with(query_lower)
                )
            {
                matches[current_index] = matches_sequence_len + 1;
                scores[current_index] = diag_score + score;
            } else {
                // We either have no score or the score is lower than the left score
                // Match: reset to 0
                // Score: pick up from left hand side
                matches[current_index] = NO_MATCH;
                scores[current_index] = left_score;
            }
        }
    }

    // Restore Positions (starting from bottom right of matrix)
    let mut positions = Vec::new();
    let mut query_index = query.len() - 1;
    let mut target_index = width - 1;
    loop {
        let current_index = query_index * width + target_index;
        if matches[current_index] == NO_MATCH {
            if target_index == 0 {
                break;
            }
            target_index -= 1; // go left
        } else {
            positions.push(target_index);
            // go up and left
            if query_index == 0 || target_index == 0 {
                break;
            }
            query_index -= 1;
            target_index -= 1;
        }
    }
    positions.reverse();

    (scores[query.len() * width - 1], positions)
}
/// Scores a single query character against a single target character.
///
/// Returns 0 if the case-folded characters are not considered equal.
/// Otherwise the score is the sum of:
/// * 1 for the match itself,
/// * `matches_sequence_len * 5` if the match extends a consecutive run,
/// * 1 if the characters also match in their original case,
/// * a positional bonus: 8 at the very start of the target (no previous
///   character), otherwise the separator bonus of the previous character,
///   otherwise 2 if the target character changes under case folding and
///   we're not inside a consecutive run (camel case), otherwise nothing.
fn compute_char_score(
    query: char,
    query_lower: char,
    target_prev: Option<char>,
    target_curr: char,
    target_curr_lower: char,
    matches_sequence_len: i32,
) -> i32 {
    // No match of characters, no score.
    if !consider_as_equal(query_lower, target_curr_lower) {
        return 0;
    }

    // Character match bonus
    let match_bonus = 1;

    // Consecutive match bonus
    let consecutive_bonus = if matches_sequence_len > 0 {
        matches_sequence_len * 5
    } else {
        0
    };

    // Same case bonus
    let same_case_bonus = i32::from(query == target_curr);

    let position_bonus = match target_prev {
        // Start of word bonus
        None => 8,
        Some(prev) => {
            // After separator bonus
            let separator_bonus = score_separator_at_pos(prev);
            if separator_bonus > 0 {
                separator_bonus
            }
            // Inside word upper case bonus (camel case). We only give this bonus
            // if we're not in a contiguous sequence. For example:
            //   NPE  => NullPointerException = boost
            //   HTTP => HTTP                 = not boost
            else if target_curr != target_curr_lower && matches_sequence_len == 0 {
                2
            } else {
                0
            }
        }
    };

    match_bonus + consecutive_bonus + same_case_bonus + position_bonus
}
/// Returns whether two (already case-folded) characters count as a match.
///
/// Characters match if they are identical. As a special case, the two path
/// separators `/` and `\` match each other, to ignore platform differences.
fn consider_as_equal(a: char, b: char) -> bool {
    let is_path_separator = |ch: char| matches!(ch, '/' | '\\');
    // Both sides must be separators. `&&` binds tighter than `||`, so the
    // grouping has to be explicit; otherwise a lone `/` in `a` (or a lone `\`
    // in `b`) would match every character.
    a == b || (is_path_separator(a) && is_path_separator(b))
}
/// Returns the bonus awarded to a match that directly follows `ch`.
///
/// Path separators (`/`, `\`) yield 5, the other recognized separators
/// (`_`, `-`, `.`, space, `'`, `"`, `:`) yield 4, anything else yields 0.
fn score_separator_at_pos(ch: char) -> i32 {
    // Prefer path separators...
    if matches!(ch, '/' | '\\') {
        5
    }
    // ...over other separators.
    else if matches!(ch, '_' | '-' | '.' | ' ' | '\'' | '"' | ':') {
        4
    } else {
        0
    }
}

382
src/helpers.rs Normal file
View file

@ -0,0 +1,382 @@
use std::borrow::Cow;
use std::cmp::Ordering;
use std::ffi::{CStr, CString, OsStr, OsString, c_char};
use std::mem;
use std::path::{Path, PathBuf};
use std::slice;
use std::str;
/// The integer type used for coordinates and extents (`Point`, `Size`, `Rect`).
pub type CoordType = i32;
/// Smallest value representable by `CoordType`.
pub const COORD_TYPE_MIN: CoordType = CoordType::MIN;
/// Largest value representable by `CoordType`.
pub const COORD_TYPE_MAX: CoordType = CoordType::MAX;
/// Lower bound of the 16-bit signed range.
/// NOTE(review): presumably "safe" means arithmetic on such values can't overflow `CoordType` — confirm.
pub const COORD_TYPE_SAFE_MIN: CoordType = i16::MIN as CoordType;
/// Upper bound of the 16-bit signed range.
pub const COORD_TYPE_SAFE_MAX: CoordType = i16::MAX as CoordType;
/// A 2D coordinate.
///
/// Points are ordered by `y` first and `x` second.
#[derive(Clone, Copy, PartialEq, Eq, Default, Debug)]
pub struct Point {
    pub x: CoordType,
    pub y: CoordType,
}

impl Point {
    /// The point with both coordinates at `COORD_TYPE_MIN`.
    pub const MIN: Self = Self {
        x: COORD_TYPE_MIN,
        y: COORD_TYPE_MIN,
    };
    /// The point with both coordinates at `COORD_TYPE_MAX`.
    pub const MAX: Self = Self {
        x: COORD_TYPE_MAX,
        y: COORD_TYPE_MAX,
    };
}

impl PartialOrd for Point {
    /// The ordering is total, so this always returns `Some`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Ord for Point {
    /// Compares rows (`y`) first; columns (`x`) only break ties.
    fn cmp(&self, other: &Self) -> Ordering {
        self.y
            .cmp(&other.y)
            .then_with(|| self.x.cmp(&other.x))
    }
}
/// A 2D extent.
#[derive(Clone, Copy, PartialEq, Eq, Default, Debug)]
pub struct Size {
    pub width: CoordType,
    pub height: CoordType,
}

impl Size {
    /// Returns the rectangle that starts at the origin (0, 0) and has this size.
    pub fn as_rect(&self) -> Rect {
        let Size { width, height } = *self;
        Rect {
            left: 0,
            top: 0,
            right: width,
            bottom: height,
        }
    }
}
/// A rectangle described by its four edges.
///
/// `contains` and `is_empty` treat `right` and `bottom` as exclusive bounds.
#[derive(Clone, Copy, PartialEq, Eq, Default, Debug)]
pub struct Rect {
    pub left: CoordType,
    pub top: CoordType,
    pub right: CoordType,
    pub bottom: CoordType,
}

impl Rect {
    /// Sets all four edges to `value`.
    pub fn one(value: CoordType) -> Self {
        Self::two(value, value)
    }

    /// Sets `top`/`bottom` to `top_bottom` and `left`/`right` to `left_right`.
    pub fn two(top_bottom: CoordType, left_right: CoordType) -> Self {
        Self::three(top_bottom, left_right, top_bottom)
    }

    /// Sets `top` and `bottom` individually and `left`/`right` to `left_right`.
    pub fn three(top: CoordType, left_right: CoordType, bottom: CoordType) -> Self {
        Self {
            left: left_right,
            top,
            right: left_right,
            bottom,
        }
    }

    /// Returns `true` if the rectangle has no positive width or no positive height.
    pub fn is_empty(&self) -> bool {
        !(self.left < self.right && self.top < self.bottom)
    }

    /// Distance between the left and right edge.
    pub fn width(&self) -> CoordType {
        self.right - self.left
    }

    /// Distance between the top and bottom edge.
    pub fn height(&self) -> CoordType {
        self.bottom - self.top
    }

    /// Returns `true` if `point` lies inside the rectangle
    /// (left/top inclusive, right/bottom exclusive).
    pub fn contains(&self, point: Point) -> bool {
        let in_columns = (self.left..self.right).contains(&point.x);
        let in_rows = (self.top..self.bottom).contains(&point.y);
        in_columns && in_rows
    }

    /// Returns the overlap of `self` and `rhs`.
    ///
    /// If the rectangles don't overlap, the result is collapsed to zero
    /// width and/or height instead of having a negative extent.
    pub fn intersect(&self, rhs: Self) -> Self {
        let left = self.left.max(rhs.left);
        let top = self.top.max(rhs.top);
        // Ensure that the size is non-negative. This avoids bugs,
        // because some height/width is negative all of a sudden.
        let right = self.right.min(rhs.right).max(left);
        let bottom = self.bottom.min(rhs.bottom).max(top);
        Self {
            left,
            top,
            right,
            bottom,
        }
    }
}
/// Packs the first, middle (`k / 2`) and last (`k - 1`) byte of a `k`-byte
/// input into the low 24 bits of the result.
///
/// # Safety
///
/// `k` must be at least 1 and `p` must be valid for reading `k` bytes.
unsafe fn wyr3(p: *const u8, k: usize) -> u64 {
    let (first, middle, last) = unsafe { (*p, *p.add(k / 2), *p.add(k - 1)) };
    (u64::from(first) << 16) | (u64::from(middle) << 8) | u64::from(last)
}
/// Reads 4 bytes at `p` as a native-endian `u32` (no alignment required)
/// and widens the value to `u64`.
///
/// # Safety
///
/// `p` must be valid for reading 4 bytes.
unsafe fn wyr4(p: *const u8) -> u64 {
    let word = unsafe { p.cast::<u32>().read_unaligned() };
    u64::from(word)
}
/// Reads 8 bytes at `p` as a native-endian `u64` (no alignment required).
///
/// # Safety
///
/// `p` must be valid for reading 8 bytes.
unsafe fn wyr8(p: *const u8) -> u64 {
    let word = p.cast::<u64>();
    unsafe { word.read_unaligned() }
}
// Multiplies both inputs into a 128-bit product and
// XORs the upper and lower 64-bit halves together.
//
// This is a weak mix function on its own. It may be worth considering
// replacing external uses of this function with a stronger one.
// On the other hand, it's very fast.
pub fn wymix(lhs: u64, rhs: u64) -> u64 {
    let product = u128::from(lhs) * u128::from(rhs);
    let upper = (product >> 64) as u64;
    let lower = product as u64;
    upper ^ lower
}
// The venerable wyhash hash function. It's fast and has good statistical properties.
// It's in the public domain.
//
// Hashes `data` into a 64-bit value. `seed` is XORed with the first secret
// constant and then mixed into the result.
// NOTE(review): `wyr4`/`wyr8` read native-endian words, so the result presumably
// differs between little- and big-endian targets — confirm if that matters.
pub fn hash(mut seed: u64, data: &[u8]) -> u64 {
unsafe {
// The four "secret" constants that get mixed into the state.
const S0: u64 = 0xa0761d6478bd642f;
const S1: u64 = 0xe7037ed1a0b428db;
const S2: u64 = 0x8ebc6af09c88c6e3;
const S3: u64 = 0x589965cc75374cc3;
let len = data.len();
let mut p = data.as_ptr();
let a;
let b;
seed ^= S0;
if len <= 16 {
if len >= 4 {
// 4..=16 bytes: `a` and `b` are each built from two 4-byte reads.
// The offset `(len >> 3) << 2` is 0, 4 or 8 depending on `len`,
// so all reads stay inside `data` but may overlap each other.
a = (wyr4(p) << 32) | wyr4(p.add((len >> 3) << 2));
b = (wyr4(p.add(len - 4)) << 32) | wyr4(p.add(len - 4 - ((len >> 3) << 2)));
} else if len > 0 {
// 1..=3 bytes: first, middle and last byte packed into `a`.
a = wyr3(p, len);
b = 0;
} else {
a = 0;
b = 0;
}
} else {
// `i` counts the bytes that haven't been consumed yet.
let mut i = len;
if i > 48 {
// Consume 48 bytes per iteration using three independent accumulators.
let mut seed1 = seed;
let mut seed2 = seed;
// A do-while loop: the block in the condition position is the loop
// body and its trailing expression (`i > 48`) is the loop condition.
while {
seed = wymix(wyr8(p) ^ S1, wyr8(p.add(8)) ^ seed);
seed1 = wymix(wyr8(p.add(16)) ^ S2, wyr8(p.add(24)) ^ seed1);
seed2 = wymix(wyr8(p.add(32)) ^ S3, wyr8(p.add(40)) ^ seed2);
p = p.add(48);
i -= 48;
i > 48
} {}
seed ^= seed1 ^ seed2;
}
// Consume the rest in 16 byte steps until at most 16 bytes remain.
while i > 16 {
seed = wymix(wyr8(p) ^ S1, wyr8(p.add(8)) ^ seed);
i -= 16;
p = p.add(16);
}
// `p + i` is the end of `data`, so these read its final 16 bytes.
// Since `i <= 16` here the offsets are zero or negative, which
// re-reads bytes that the loops above already consumed.
a = wyr8(p.offset(i as isize - 16));
b = wyr8(p.offset(i as isize - 8));
}
// Final mix of the input length, the two tail words and the accumulated seed.
wymix(S1 ^ (len as u64), wymix(a ^ S1, b ^ seed))
}
}
/// Hashes the UTF-8 bytes of `s` with `hash`, using the given `seed`.
pub fn hash_str(seed: u64, s: &str) -> u64 {
    let bytes = s.as_bytes();
    hash(seed, bytes)
}
/// Appends `total_copies` copies of `ch` to the end of `dst`.
pub fn string_append_repeat(dst: &mut String, ch: char, total_copies: usize) {
    if total_copies == 0 {
        return;
    }

    // Only complete UTF-8 encodings of `ch` get appended below,
    // so `dst` remains valid UTF-8.
    let bytes = unsafe { dst.as_mut_vec() };

    if ch.is_ascii() {
        // Compiles down to `memset()`.
        bytes.extend(std::iter::repeat_n(ch as u8, total_copies));
        return;
    }

    // Non-ASCII: write the encoded character once, then keep duplicating
    // what has been written so far, doubling the amount with each step.
    let mut encoded = [0; 4];
    let pattern = ch.encode_utf8(&mut encoded).as_bytes();
    let start = bytes.len();
    let total = pattern.len() * total_copies;
    let target_len = start + total;

    bytes.reserve(total);
    bytes.extend_from_slice(pattern);

    while bytes.len() != target_len {
        let written = bytes.len() - start;
        let remaining = target_len - bytes.len();
        // Never copy more than what's still missing.
        let chunk = written.min(remaining);
        bytes.extend_from_within(start..start + chunk);
    }
}
/// Returns the two values as `[smaller, larger]`.
/// If neither is smaller than the other, the argument order is kept.
///
/// `std::cmp::minmax` is unstable, as per usual.
pub fn minmax<T>(v1: T, v2: T) -> [T; 2]
where
    T: Ord,
{
    match v2 < v1 {
        true => [v2, v1],
        false => [v1, v2],
    }
}
/// A `PathBuf` bundled with its lossy UTF-8 rendering,
/// which is computed once at construction.
pub struct DisplayablePathBuf {
    value: PathBuf,
    // The result of `value.to_string_lossy()`. If it's `Cow::Borrowed`, it
    // points into `value`; the `'static` lifetime is not its real lifetime.
    str: Cow<'static, str>,
}

impl DisplayablePathBuf {
    /// Wraps `value` and caches its lossy string representation.
    pub fn new(value: PathBuf) -> Self {
        let lossy: Cow<'_, str> = value.to_string_lossy();
        // NOTE(review): this erases the borrow of `value`. It presumably relies
        // on the path's heap buffer staying in place when the `PathBuf` is moved,
        // and on `value` never being mutated while `str` exists — confirm.
        let str = unsafe { mem::transmute::<Cow<'_, str>, Cow<'static, str>>(lossy) };
        Self { value, str }
    }

    /// The wrapped path.
    pub fn as_path(&self) -> &Path {
        self.value.as_path()
    }

    /// The cached lossy string representation of the path.
    pub fn as_str(&self) -> &str {
        self.str.as_ref()
    }

    /// The encoded bytes of the underlying `OsStr` (not of the lossy string).
    pub fn as_bytes(&self) -> &[u8] {
        let os_str = self.value.as_os_str();
        os_str.as_encoded_bytes()
    }

    /// Returns a copy of the wrapped path.
    pub fn clone_path_buf(&self) -> PathBuf {
        self.value.clone()
    }

    /// Consumes the wrapper and returns the wrapped path.
    pub fn take(self) -> PathBuf {
        self.value
    }
}

impl Default for DisplayablePathBuf {
    /// An empty path with an empty string representation.
    fn default() -> Self {
        Self {
            value: PathBuf::default(),
            str: Cow::Borrowed(""),
        }
    }
}

impl Clone for DisplayablePathBuf {
    /// Clones the path and rebuilds the cached string from the clone.
    fn clone(&self) -> Self {
        Self::new(self.clone_path_buf())
    }
}

impl From<OsString> for DisplayablePathBuf {
    fn from(s: OsString) -> Self {
        Self::new(PathBuf::from(s))
    }
}

impl<T: ?Sized + AsRef<OsStr>> From<&T> for DisplayablePathBuf {
    fn from(s: &T) -> Self {
        Self::new(PathBuf::from(s))
    }
}
/// A `CString` bundled with its lossy UTF-8 rendering,
/// which is computed once at construction.
pub struct DisplayableCString {
    value: CString,
    // The result of `value.to_string_lossy()`. If it's `Cow::Borrowed`, it
    // points into `value`; the `'static` lifetime is not its real lifetime.
    str: Cow<'static, str>,
}

impl DisplayableCString {
    /// Wraps `value` and caches its lossy string representation.
    pub fn new(value: CString) -> Self {
        let lossy: Cow<'_, str> = value.to_string_lossy();
        // NOTE(review): this erases the borrow of `value`. It presumably relies
        // on the string's heap buffer staying in place when the `CString` is
        // moved, and on `value` never being mutated while `str` exists — confirm.
        let str = unsafe { mem::transmute::<Cow<'_, str>, Cow<'static, str>>(lossy) };
        Self { value, str }
    }

    /// Copies the C string at `ptr` into a new instance.
    ///
    /// # Safety
    ///
    /// `ptr` must satisfy the requirements of `CStr::from_ptr`.
    pub unsafe fn from_ptr(ptr: *const c_char) -> Self {
        let borrowed = unsafe { CStr::from_ptr(ptr) };
        let owned = borrowed.to_owned();
        Self::new(owned)
    }

    /// The wrapped C string.
    pub fn as_cstr(&self) -> &CStr {
        self.value.as_c_str()
    }

    /// The cached lossy string representation.
    pub fn as_str(&self) -> &str {
        self.str.as_ref()
    }
}
/// Forms a `&str` from a pointer and a length in bytes, without any checks.
///
/// # Safety
///
/// The requirements of `slice::from_raw_parts` apply to `ptr` and `len`,
/// and the bytes must be valid UTF-8 (see `str::from_utf8_unchecked`).
#[inline]
#[must_use]
pub const unsafe fn str_from_raw_parts<'a>(ptr: *const u8, len: usize) -> &'a str {
    let bytes: &'a [u8] = unsafe { slice::from_raw_parts(ptr, len) };
    unsafe { str::from_utf8_unchecked(bytes) }
}
/// Forms a `&mut str` from a pointer and a length in bytes, without any checks.
///
/// # Safety
///
/// The requirements of `slice::from_raw_parts_mut` apply to `ptr` and `len`,
/// and the bytes must be valid UTF-8 (see `str::from_utf8_unchecked_mut`).
#[inline]
#[must_use]
pub const unsafe fn str_from_raw_parts_mut<'a>(ptr: *mut u8, len: usize) -> &'a mut str {
    let bytes: &'a mut [u8] = unsafe { slice::from_raw_parts_mut(ptr, len) };
    unsafe { str::from_utf8_unchecked_mut(bytes) }
}
/// Inserts all elements of `src` into `dst` at index `off`.
///
/// An `off` beyond the end of `dst` is clamped, i.e. `src` gets appended.
pub fn vec_insert_at<T: Copy>(dst: &mut Vec<T>, off: usize, src: &[T]) {
    let old_len = dst.len();
    let count = src.len();
    let at = off.min(old_len);
    let tail_len = old_len - at;

    // Make room for the new elements. NOTE that this must be done before
    // we call as_mut_ptr, or else we risk accessing a stale pointer.
    dst.reserve(count);

    unsafe {
        let gap = dst.as_mut_ptr().add(at);

        // Shift the elements behind the insertion point out of the way.
        // Source and destination may overlap, hence `copy` (memmove).
        if tail_len > 0 {
            std::ptr::copy(gap, gap.add(count), tail_len);
        }

        // Fill the gap with the new elements.
        std::ptr::copy_nonoverlapping(src.as_ptr(), gap, count);

        // All `old_len + count` elements are initialized now.
        dst.set_len(old_len + count);
    }
}
// Works just like `std::hint::cold_path`, but it's stable.
//
// The function has no body; it only carries the `#[cold]` attribute. Call it
// at the start of a branch to mark that branch as unlikely to be taken.
// NOTE(review): whether the hint survives `#[inline(always)]` presumably
// depends on the compiler — confirm if the hint matters for performance.
#[cold]
#[inline(always)]
pub const fn cold_path() {}

1027
src/icu.rs Normal file

File diff suppressed because it is too large Load diff

488
src/input.rs Normal file
View file

@ -0,0 +1,488 @@
use crate::helpers::{Point, Size};
use crate::vt;
// TODO: Is this a good idea? I did it to allow typing `kbmod::CTRL | vk::A`.
// The reason it's an awkard u32 and not a struct is to hopefully make ABIs easier later.
// Of course you could just translate on the ABI boundary, but my hope is that this
// design lets me realize some restrictions early on that I can't foresee yet.
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct InputKey(u32);
impl InputKey {
pub const fn new(v: u32) -> Self {
Self(v)
}
pub const fn value(&self) -> u32 {
self.0
}
pub const fn key(&self) -> InputKey {
InputKey(self.0 & 0x00FFFFFF)
}
pub const fn modifiers(&self) -> InputKeyMod {
InputKeyMod(self.0 & 0xFF000000)
}
pub const fn modifiers_contains(&self, modifier: InputKeyMod) -> bool {
(self.0 & modifier.0) != 0
}
pub const fn with_modifiers(&self, modifiers: InputKeyMod) -> InputKey {
InputKey(self.0 | modifiers.0)
}
}
/// A set of keyboard modifier bits. The predefined values live in `kbmod`.
#[repr(transparent)]
#[derive(Clone, Copy, PartialEq, Eq)]
pub struct InputKeyMod(u32);

impl InputKeyMod {
    /// Wraps raw modifier bits. Private: outside code uses the `kbmod` constants.
    const fn new(v: u32) -> Self {
        InputKeyMod(v)
    }

    /// Returns `true` if `self` and `modifier` share at least one set bit.
    pub const fn contains(&self, modifier: InputKeyMod) -> bool {
        let shared = self.0 & modifier.0;
        shared != 0
    }
}
impl std::ops::BitOr<InputKeyMod> for InputKey {
type Output = InputKey;
fn bitor(self, rhs: InputKeyMod) -> InputKey {
InputKey(self.0 | rhs.0)
}
}
impl std::ops::BitOr<InputKey> for InputKeyMod {
type Output = InputKey;
fn bitor(self, rhs: InputKey) -> InputKey {
InputKey(self.0 | rhs.0)
}
}
impl std::ops::BitOrAssign for InputKeyMod {
fn bitor_assign(&mut self, rhs: Self) {
self.0 |= rhs.0;
}
}
// The codes defined here match the VK_* constants on Windows.
// It's a convenient way to handle keyboard input, even on other platforms.
//
// All values fit into the lower 24 bits of an `InputKey`,
// which leaves the upper 8 bits free for `kbmod` modifiers.
pub mod vk {
use super::InputKey;
pub const NULL: InputKey = InputKey::new(0x00);
// Control and whitespace keys.
pub const BACK: InputKey = InputKey::new(0x08);
pub const TAB: InputKey = InputKey::new(0x09);
pub const RETURN: InputKey = InputKey::new(0x0D);
pub const ESCAPE: InputKey = InputKey::new(0x1B);
pub const SPACE: InputKey = InputKey::new(0x20);
// Navigation and editing keys. PRIOR / NEXT are Page Up / Page Down.
pub const PRIOR: InputKey = InputKey::new(0x21);
pub const NEXT: InputKey = InputKey::new(0x22);
pub const END: InputKey = InputKey::new(0x23);
pub const HOME: InputKey = InputKey::new(0x24);
pub const LEFT: InputKey = InputKey::new(0x25);
pub const UP: InputKey = InputKey::new(0x26);
pub const RIGHT: InputKey = InputKey::new(0x27);
pub const DOWN: InputKey = InputKey::new(0x28);
pub const INSERT: InputKey = InputKey::new(0x2D);
pub const DELETE: InputKey = InputKey::new(0x2E);
// Letter keys: the code is the ASCII value of the uppercase letter.
pub const A: InputKey = InputKey::new('A' as u32);
pub const B: InputKey = InputKey::new('B' as u32);
pub const C: InputKey = InputKey::new('C' as u32);
pub const D: InputKey = InputKey::new('D' as u32);
pub const E: InputKey = InputKey::new('E' as u32);
pub const F: InputKey = InputKey::new('F' as u32);
pub const G: InputKey = InputKey::new('G' as u32);
pub const H: InputKey = InputKey::new('H' as u32);
pub const I: InputKey = InputKey::new('I' as u32);
pub const J: InputKey = InputKey::new('J' as u32);
pub const K: InputKey = InputKey::new('K' as u32);
pub const L: InputKey = InputKey::new('L' as u32);
pub const M: InputKey = InputKey::new('M' as u32);
pub const N: InputKey = InputKey::new('N' as u32);
pub const O: InputKey = InputKey::new('O' as u32);
pub const P: InputKey = InputKey::new('P' as u32);
pub const Q: InputKey = InputKey::new('Q' as u32);
pub const R: InputKey = InputKey::new('R' as u32);
pub const S: InputKey = InputKey::new('S' as u32);
pub const T: InputKey = InputKey::new('T' as u32);
pub const U: InputKey = InputKey::new('U' as u32);
pub const V: InputKey = InputKey::new('V' as u32);
pub const W: InputKey = InputKey::new('W' as u32);
pub const X: InputKey = InputKey::new('X' as u32);
pub const Y: InputKey = InputKey::new('Y' as u32);
pub const Z: InputKey = InputKey::new('Z' as u32);
// Numeric keypad.
pub const NUMPAD0: InputKey = InputKey::new(0x60);
pub const NUMPAD1: InputKey = InputKey::new(0x61);
pub const NUMPAD2: InputKey = InputKey::new(0x62);
pub const NUMPAD3: InputKey = InputKey::new(0x63);
pub const NUMPAD4: InputKey = InputKey::new(0x64);
pub const NUMPAD5: InputKey = InputKey::new(0x65);
pub const NUMPAD6: InputKey = InputKey::new(0x66);
pub const NUMPAD7: InputKey = InputKey::new(0x67);
pub const NUMPAD8: InputKey = InputKey::new(0x68);
pub const NUMPAD9: InputKey = InputKey::new(0x69);
pub const MULTIPLY: InputKey = InputKey::new(0x6A);
pub const ADD: InputKey = InputKey::new(0x6B);
pub const SEPARATOR: InputKey = InputKey::new(0x6C);
pub const SUBTRACT: InputKey = InputKey::new(0x6D);
pub const DECIMAL: InputKey = InputKey::new(0x6E);
pub const DIVIDE: InputKey = InputKey::new(0x6F);
// Function keys. The codes are consecutive, so F<n> is `F1 + (n - 1)`.
pub const F1: InputKey = InputKey::new(0x70);
pub const F2: InputKey = InputKey::new(0x71);
pub const F3: InputKey = InputKey::new(0x72);
pub const F4: InputKey = InputKey::new(0x73);
pub const F5: InputKey = InputKey::new(0x74);
pub const F6: InputKey = InputKey::new(0x75);
pub const F7: InputKey = InputKey::new(0x76);
pub const F8: InputKey = InputKey::new(0x77);
pub const F9: InputKey = InputKey::new(0x78);
pub const F10: InputKey = InputKey::new(0x79);
pub const F11: InputKey = InputKey::new(0x7A);
pub const F12: InputKey = InputKey::new(0x7B);
pub const F13: InputKey = InputKey::new(0x7C);
pub const F14: InputKey = InputKey::new(0x7D);
pub const F15: InputKey = InputKey::new(0x7E);
pub const F16: InputKey = InputKey::new(0x7F);
pub const F17: InputKey = InputKey::new(0x80);
pub const F18: InputKey = InputKey::new(0x81);
pub const F19: InputKey = InputKey::new(0x82);
pub const F20: InputKey = InputKey::new(0x83);
pub const F21: InputKey = InputKey::new(0x84);
pub const F22: InputKey = InputKey::new(0x85);
pub const F23: InputKey = InputKey::new(0x86);
pub const F24: InputKey = InputKey::new(0x87);
}
/// Keyboard modifier flags. They occupy the upper 8 bits of an `InputKey`,
/// so they can be combined with the key codes in `vk` via `|`.
pub mod kbmod {
    use super::InputKeyMod;

    // One bit per modifier.
    const CTRL_BIT: u32 = 1 << 24;
    const ALT_BIT: u32 = 1 << 25;
    const SHIFT_BIT: u32 = 1 << 26;

    pub const NONE: InputKeyMod = InputKeyMod::new(0);
    pub const CTRL: InputKeyMod = InputKeyMod::new(CTRL_BIT);
    pub const ALT: InputKeyMod = InputKeyMod::new(ALT_BIT);
    pub const SHIFT: InputKeyMod = InputKeyMod::new(SHIFT_BIT);

    // Predefined combinations of the above.
    pub const CTRL_ALT: InputKeyMod = InputKeyMod::new(CTRL_BIT | ALT_BIT);
    pub const CTRL_SHIFT: InputKeyMod = InputKeyMod::new(CTRL_BIT | SHIFT_BIT);
    pub const ALT_SHIFT: InputKeyMod = InputKeyMod::new(ALT_BIT | SHIFT_BIT);
    pub const CTRL_ALT_SHIFT: InputKeyMod = InputKeyMod::new(CTRL_BIT | ALT_BIT | SHIFT_BIT);
}
/// Text input, borrowed from the input that is being parsed.
#[derive(Clone, Copy)]
pub struct InputText<'a> {
/// The text that was entered.
pub text: &'a str,
/// NOTE(review): presumably whether the text arrived via bracketed paste — confirm.
pub bracketed: bool,
}
/// What a mouse event reports: a pressed button, a release, a scroll, or nothing.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Default)]
pub enum InputMouseState {
/// No button is pressed. This is the default.
#[default]
None,
// These 3 carry their state between frames.
Left,
Middle,
Right,
// These 2 get reset to None on the next frame.
Release,
Scroll,
}
/// A single mouse event.
#[derive(Clone, Copy)]
pub struct InputMouse {
/// The button / action this event reports.
pub state: InputMouseState,
/// The keyboard modifiers that were reported together with the event.
pub modifiers: InputKeyMod,
/// The position of the event. The parser in this file produces zero-based coordinates.
pub position: Point,
/// The scroll delta. It's only non-zero for scroll events.
pub scroll: Point,
}
/// A single input action produced by `Stream::next`.
pub enum Input<'input> {
/// The terminal reported a new size.
Resize(Size),
/// Text was entered. The text borrows from the parsed input.
Text(InputText<'input>),
/// A key was pressed, possibly combined with modifiers.
Keyboard(InputKey),
/// A mouse event occurred.
Mouse(InputMouse),
}
/// Holds the parser state that has to survive between calls to `parse`.
pub struct Parser {
    // Set once a `CSI M` without parameters was seen: the 3 payload
    // bytes of an X10 mouse report still need to be read.
    want: bool,
    // The X10 payload bytes collected so far...
    buf: [u8; 3],
    // ...and how many of them are valid.
    len: usize,
}

impl Parser {
    /// Creates a parser that isn't in the middle of any sequence.
    pub fn new() -> Self {
        Parser {
            len: 0,
            buf: [0; 3],
            want: false,
        }
    }

    /// Turns VT sequences into keyboard, mouse, etc., inputs.
    ///
    /// The returned `Stream` pulls tokens from the given VT `stream`
    /// and keeps its cross-call state in this `Parser`.
    pub fn parse<'parser, 'vt, 'input>(
        &'parser mut self,
        stream: vt::Stream<'vt, 'input>,
    ) -> Stream<'parser, 'vt, 'input> {
        let parser = self;
        Stream { parser, stream }
    }
}
/// Translates the tokens of a VT stream into `Input`s. Created by `Parser::parse`.
pub struct Stream<'parser, 'vt, 'input> {
// The state that outlives this stream (the pending X10 mouse report).
parser: &'parser mut Parser,
// The source of VT tokens.
stream: vt::Stream<'vt, 'input>,
}
impl Stream<'_, '_, '_> {
/// Parses the next input action from the previously given input.
///
/// Can't implement Iterator, because this is a "lending iterator".
pub fn next(&mut self) -> Option<Input> {
if self.parser.want {
return self.parse_x10_mouse_coordinates();
}
let token = self.stream.next()?;
match token {
vt::Token::Text(text) => Some(Input::Text(InputText {
text,
bracketed: false,
})),
vt::Token::Ctrl(ch) => match ch {
'\0' | '\t' | '\r' => Some(Input::Keyboard(InputKey::new(ch as u32))),
..='\x1a' => {
// Shift control code to A-Z
let key = ch as u32 | 0x40;
Some(Input::Keyboard(kbmod::CTRL | InputKey::new(key)))
}
'\x7f' => Some(Input::Keyboard(vk::BACK)),
_ => None,
},
vt::Token::Esc(ch) => {
match ch {
'\0' => Some(Input::Keyboard(vk::ESCAPE)),
' '..='~' => {
let ch = ch as u32;
let key = ch & !0x20; // Shift a-z to A-Z
let modifiers = if (ch & 0x20) != 0 {
kbmod::ALT
} else {
kbmod::ALT_SHIFT
};
Some(Input::Keyboard(modifiers | InputKey::new(key)))
}
_ => None,
}
}
vt::Token::SS3(ch) => {
if ('P'..='S').contains(&ch) {
let key = vk::F1.value() + ch as u32 - 'P' as u32;
Some(Input::Keyboard(InputKey::new(key)))
} else {
None
}
}
vt::Token::Csi(csi) => {
match csi.final_byte {
'A'..='H' => {
const LUT: [u8; 8] = [
vk::UP.value() as u8, // A
vk::DOWN.value() as u8, // B
vk::RIGHT.value() as u8, // C
vk::LEFT.value() as u8, // D
0, // E
vk::END.value() as u8, // F
0, // G
vk::HOME.value() as u8, // H
];
let vk = LUT[csi.final_byte as usize - 'A' as usize];
if vk != 0 {
return Some(Input::Keyboard(
InputKey::new(vk as u32) | Self::parse_modifiers(csi),
));
}
None
}
'Z' => return Some(Input::Keyboard(kbmod::SHIFT | vk::TAB)),
'~' => {
const LUT: [u8; 35] = [
0,
vk::HOME.value() as u8, // 1
vk::INSERT.value() as u8, // 2
vk::DELETE.value() as u8, // 3
vk::END.value() as u8, // 4
vk::PRIOR.value() as u8, // 5
vk::NEXT.value() as u8, // 6
0,
0,
0,
0,
0,
0,
0,
0,
vk::F5.value() as u8, // 15
0,
vk::F6.value() as u8, // 17
vk::F7.value() as u8, // 18
vk::F8.value() as u8, // 19
vk::F9.value() as u8, // 20
vk::F10.value() as u8, // 21
0,
vk::F11.value() as u8, // 23
vk::F12.value() as u8, // 24
vk::F13.value() as u8, // 25
vk::F14.value() as u8, // 26
0,
vk::F15.value() as u8, // 28
vk::F16.value() as u8, // 29
0,
vk::F17.value() as u8, // 31
vk::F18.value() as u8, // 32
vk::F19.value() as u8, // 33
vk::F20.value() as u8, // 34
];
let p0 = csi.params[0];
if p0 >= 0 && p0 <= LUT.len() as i32 {
let vk = LUT[p0 as usize];
if vk != 0 {
return Some(Input::Keyboard(
InputKey::new(vk as u32) | Self::parse_modifiers(csi),
));
}
}
None
}
'm' | 'M' if csi.private_byte == '<' => {
let btn = csi.params[0];
let mut mouse = InputMouse {
state: InputMouseState::None,
modifiers: kbmod::NONE,
position: Point::default(),
scroll: Point::default(),
};
mouse.state = InputMouseState::None;
if (btn & 0x40) != 0 {
mouse.state = InputMouseState::Scroll;
mouse.scroll.y += if (btn & 0x01) != 0 { 3 } else { -3 };
} else if csi.final_byte == 'M' {
const STATES: [InputMouseState; 4] = [
InputMouseState::Left,
InputMouseState::Middle,
InputMouseState::Right,
InputMouseState::None,
];
mouse.state = STATES[(btn as usize) & 0x03];
}
mouse.modifiers = kbmod::NONE;
mouse.modifiers |= if (btn & 0x04) != 0 {
kbmod::SHIFT
} else {
kbmod::NONE
};
mouse.modifiers |= if (btn & 0x08) != 0 {
kbmod::ALT
} else {
kbmod::NONE
};
mouse.modifiers |= if (btn & 0x10f) != 0 {
kbmod::CTRL
} else {
kbmod::NONE
};
mouse.position.x = csi.params[1] - 1;
mouse.position.y = csi.params[2] - 1;
Some(Input::Mouse(mouse))
}
'M' if csi.param_count == 0 => {
self.parser.want = true;
None
}
't' if csi.params[0] == 8 => {
// Window Size
let width = csi.params[2].clamp(1, 32767);
let height = csi.params[1].clamp(1, 32767);
Some(Input::Resize(Size { width, height }))
}
_ => None,
}
}
_ => None,
}
}
/// Implements the X10 mouse protocol via `CSI M CbCxCy`.
///
/// You want to send numeric mouse coordinates.
/// You have CSI sequences with numeric parameters.
/// So, of course you put the coordinates as shifted ASCII characters after
/// the end of the sequence. Limited coordinate range and complicated parsing!
/// This is so puzzling to me. The existence of this function makes me unhappy.
///
/// Collects the 3 payload bytes in the parser's buffer, possibly across
/// multiple calls. Returns `None` until all 3 bytes have arrived; after that
/// the pending state is cleared and the decoded mouse event is returned.
#[cold]
fn parse_x10_mouse_coordinates(&mut self) -> Option<Input> {
    self.parser.len += self.stream.read(&mut self.parser.buf[self.parser.len..]);
    if self.parser.len < 3 {
        return None;
    }

    let [cb, cx, cy] = self.parser.buf;

    // The lowest 2 bits of Cb encode the button.
    let state = match cb & 0b11 {
        0 => InputMouseState::Left,
        1 => InputMouseState::Middle,
        2 => InputMouseState::Right,
        _ => InputMouseState::None,
    };

    // The next 3 bits are independent modifier flags (4 = Shift, 8 = Alt,
    // 16 = Ctrl). Test each bit on its own, so that combinations such as
    // Ctrl+Shift are reported as well instead of being dropped.
    let mut modifiers = kbmod::NONE;
    if (cb & 0b00100) != 0 {
        modifiers |= kbmod::SHIFT;
    }
    if (cb & 0b01000) != 0 {
        modifiers |= kbmod::ALT;
    }
    if (cb & 0b10000) != 0 {
        modifiers |= kbmod::CTRL;
    }

    // Coordinates are sent as `value + 32` and are 1-based, so
    // subtracting 0x21 yields zero-based coordinates.
    let x = cx as i32 - 0x21;
    let y = cy as i32 - 0x21;

    self.parser.want = false;
    self.parser.len = 0;

    Some(Input::Mouse(InputMouse {
        state,
        modifiers,
        position: Point { x, y },
        scroll: Point::default(),
    }))
}
/// Decodes the modifier parameter (the second parameter) of a CSI sequence.
///
/// The parameter is a bitmask plus 1: bit 0 is Shift, bit 1 is Alt and
/// bit 2 is Ctrl. A missing parameter (0) is treated as "no modifiers".
fn parse_modifiers(csi: &vt::Csi) -> InputKeyMod {
    let bits = (csi.params[1] - 1).max(0);
    let table = [
        (0x01, kbmod::SHIFT),
        (0x02, kbmod::ALT),
        (0x04, kbmod::CTRL),
    ];

    let mut modifiers = kbmod::NONE;
    for (mask, modifier) in table {
        if (bits & mask) != 0 {
            modifiers |= modifier;
        }
    }
    modifiers
}
}

663
src/loc.rs Normal file
View file

@ -0,0 +1,663 @@
use crate::sys;
/// Identifies a localizable string.
///
/// The variants index the outer dimension of `S_LANG_LUT`, so the order of
/// the variants has to match the order of the entries in that table.
#[derive(Clone, Copy, PartialEq, Eq)]
pub enum LocId {
// Names of the modifier keys
Ctrl,
Alt,
Shift,
// File menu
File,
FileSave,
FileSaveAs,
FileExit,
// Edit menu
Edit,
EditUndo,
EditRedo,
EditCut,
EditCopy,
EditPaste,
EditFind,
EditReplace,
// View menu
View,
ViewWordWrap,
// Help menu
Help,
HelpAbout,
// Exit dialog
UnsavedChangesDialogTitle,
UnsavedChangesDialogDescription,
UnsavedChangesDialogYes,
UnsavedChangesDialogNo,
UnsavedChangesDialogCancel,
// About dialog
AboutDialogTitle,
AboutDialogDescription,
AboutDialogVersion,
SearchLabel,
SearchClose,
SearchMatchCase,
SearchWholeWord,
SearchUseRegex,
EncodingReopen,
EncodingConvert,
IndentationTabs,
IndentationSpaces,
SaveAsDialogTitle,
SaveAsDialogFilenameLabel,
// Not a string: the number of variants above. Used as the length of `S_LANG_LUT`.
Count,
}
/// Identifies a supported language.
///
/// The variants index the inner arrays of `S_LANG_LUT`, so the order of the
/// variants has to match the order of the translations in each entry.
#[allow(non_camel_case_types)]
#[derive(Clone, Copy, PartialEq, Eq)]
enum LangId {
// Base language. It's always the first one.
en,
// Other languages. Sorted alphabetically.
de,
es,
fr,
it,
ja,
ko,
pt_br,
ru,
zh_hans,
zh_hant,
// Not a language: the number of variants above. Used as the length of each `S_LANG_LUT` entry.
Count,
}
#[rustfmt::skip]
const S_LANG_LUT: [[&str; LangId::Count as usize]; LocId::Count as usize] = [
// Ctrl
[
/* en */ "Ctrl",
/* de */ "Strg",
/* es */ "Ctrl",
/* fr */ "Ctrl",
/* it */ "Ctrl",
/* ja */ "Ctrl",
/* ko */ "Ctrl",
/* pt_br */ "Ctrl",
/* ru */ "Ctrl",
/* zh_hans */ "Ctrl",
/* zh_hant */ "Ctrl",
],
// Alt
[
/* en */ "Alt",
/* de */ "Alt",
/* es */ "Alt",
/* fr */ "Alt",
/* it */ "Alt",
/* ja */ "Alt",
/* ko */ "Alt",
/* pt_br */ "Alt",
/* ru */ "Alt",
/* zh_hans */ "Alt",
/* zh_hant */ "Alt",
],
// Shift
[
/* en */ "Shift",
/* de */ "Umschalt",
/* es */ "Mayús",
/* fr */ "Maj",
/* it */ "Maiusc",
/* ja */ "Shift",
/* ko */ "Shift",
/* pt_br */ "Shift",
/* ru */ "Shift",
/* zh_hans */ "Shift",
/* zh_hant */ "Shift",
],
// File
[
/* en */ "File",
/* de */ "Datei",
/* es */ "Archivo",
/* fr */ "Fichier",
/* it */ "File",
/* ja */ "ファイル",
/* ko */ "파일",
/* pt_br */ "Arquivo",
/* ru */ "Файл",
/* zh_hans */ "文件",
/* zh_hant */ "檔案",
],
// FileSave
[
/* en */ "Save",
/* de */ "Speichern",
/* es */ "Guardar",
/* fr */ "Enregistrer",
/* it */ "Salva",
/* ja */ "保存",
/* ko */ "저장",
/* pt_br */ "Salvar",
/* ru */ "Сохранить",
/* zh_hans */ "保存",
/* zh_hant */ "儲存",
],
// FileSaveAs
// NOTE: Exact same translation as SaveAsDialogTitle, and both should be kept in sync.
[
/* en */ "Save As…",
/* de */ "Speichern unter…",
/* es */ "Guardar como…",
/* fr */ "Enregistrer sous…",
/* it */ "Salva come…",
/* ja */ "名前を付けて保存…",
/* ko */ "다른 이름으로 저장…",
/* pt_br */ "Salvar como…",
/* ru */ "Сохранить как…",
/* zh_hans */ "另存为…",
/* zh_hant */ "另存新檔…",
],
// FileExit
[
/* en */ "Exit",
/* de */ "Beenden",
/* es */ "Salir",
/* fr */ "Quitter",
/* it */ "Esci",
/* ja */ "終了",
/* ko */ "종료",
/* pt_br */ "Sair",
/* ru */ "Выход",
/* zh_hans */ "退出",
/* zh_hant */ "退出",
],
// Edit
[
/* en */ "Edit",
/* de */ "Bearbeiten",
/* es */ "Editar",
/* fr */ "Éditer",
/* it */ "Modifica",
/* ja */ "編集",
/* ko */ "편집",
/* pt_br */ "Editar",
/* ru */ "Правка",
/* zh_hans */ "编辑",
/* zh_hant */ "編輯",
],
// EditUndo
[
/* en */ "Undo",
/* de */ "Rückgängig",
/* es */ "Deshacer",
/* fr */ "Annuler",
/* it */ "Annulla",
/* ja */ "元に戻す",
/* ko */ "실행 취소",
/* pt_br */ "Desfazer",
/* ru */ "Отменить",
/* zh_hans */ "撤销",
/* zh_hant */ "復原",
],
// EditRedo
[
/* en */ "Redo",
/* de */ "Wiederholen",
/* es */ "Rehacer",
/* fr */ "Rétablir",
/* it */ "Ripeti",
/* ja */ "やり直し",
/* ko */ "다시 실행",
/* pt_br */ "Refazer",
/* ru */ "Повторить",
/* zh_hans */ "重做",
/* zh_hant */ "重做",
],
// EditCut
[
/* en */ "Cut",
/* de */ "Ausschneiden",
/* es */ "Cortar",
/* fr */ "Couper",
/* it */ "Taglia",
/* ja */ "切り取り",
/* ko */ "잘라내기",
/* pt_br */ "Cortar",
/* ru */ "Вырезать",
/* zh_hans */ "剪切",
/* zh_hant */ "剪下",
],
// EditCopy
[
/* en */ "Copy",
/* de */ "Kopieren",
/* es */ "Copiar",
/* fr */ "Copier",
/* it */ "Copia",
/* ja */ "コピー",
/* ko */ "복사",
/* pt_br */ "Copiar",
/* ru */ "Копировать",
/* zh_hans */ "复制",
/* zh_hant */ "複製",
],
// EditPaste
[
/* en */ "Paste",
/* de */ "Einfügen",
/* es */ "Pegar",
/* fr */ "Coller",
/* it */ "Incolla",
/* ja */ "貼り付け",
/* ko */ "붙여넣기",
/* pt_br */ "Colar",
/* ru */ "Вставить",
/* zh_hans */ "粘贴",
/* zh_hant */ "貼上",
],
// EditFind
[
/* en */ "Find",
/* de */ "Suchen",
/* es */ "Buscar",
/* fr */ "Rechercher",
/* it */ "Trova",
/* ja */ "検索",
/* ko */ "찾기",
/* pt_br */ "Encontrar",
/* ru */ "Найти",
/* zh_hans */ "查找",
/* zh_hant */ "尋找",
],
// EditReplace
[
/* en */ "Replace",
/* de */ "Ersetzen",
/* es */ "Reemplazar",
/* fr */ "Remplacer",
/* it */ "Sostituisci",
/* ja */ "置換",
/* ko */ "바꾸기",
/* pt_br */ "Substituir",
/* ru */ "Заменить",
/* zh_hans */ "替换",
/* zh_hant */ "取代",
],
// View
[
/* en */ "View",
/* de */ "Ansicht",
/* es */ "Ver",
/* fr */ "Affichage",
/* it */ "Visualizza",
/* ja */ "表示",
/* ko */ "보기",
/* pt_br */ "Exibir",
/* ru */ "Вид",
/* zh_hans */ "视图",
/* zh_hant */ "檢視",
],
// ViewWordWrap
[
/* en */ "Word Wrap",
/* de */ "Zeilenumbruch",
/* es */ "Ajuste de línea",
/* fr */ "Retour à la ligne",
/* it */ "A capo automatico",
/* ja */ "折り返し",
/* ko */ "자동 줄 바꿈",
/* pt_br */ "Quebra de linha",
/* ru */ "Перенос слов",
/* zh_hans */ "自动换行",
/* zh_hant */ "自動換行",
],
// Help
[
/* en */ "Help",
/* de */ "Hilfe",
/* es */ "Ayuda",
/* fr */ "Aide",
/* it */ "Aiuto",
/* ja */ "ヘルプ",
/* ko */ "도움말",
/* pt_br */ "Ajuda",
/* ru */ "Помощь",
/* zh_hans */ "帮助",
/* zh_hant */ "幫助",
],
// HelpAbout
[
/* en */ "About",
/* de */ "Über",
/* es */ "Acerca de",
/* fr */ "À propos",
/* it */ "Informazioni",
/* ja */ "情報",
/* ko */ "정보",
/* pt_br */ "Sobre",
/* ru */ "О программе",
/* zh_hans */ "关于",
/* zh_hant */ "關於",
],
// UnsavedChangesDialogTitle
[
/* en */ "Unsaved Changes",
/* de */ "Ungespeicherte Änderungen",
/* es */ "Cambios sin guardar",
/* fr */ "Modifications non enregistrées",
/* it */ "Modifiche non salvate",
/* ja */ "未保存の変更",
/* ko */ "저장되지 않은 변경 사항",
/* pt_br */ "Alterações não salvas",
/* ru */ "Несохраненные изменения",
/* zh_hans */ "未保存的更改",
/* zh_hant */ "未儲存的變更",
],
// UnsavedChangesDialogDescription
[
/* en */ "Do you want to save the changes you made?",
/* de */ "Möchten Sie die vorgenommenen Änderungen speichern?",
/* es */ "¿Desea guardar los cambios realizados?",
/* fr */ "Voulez-vous enregistrer les modifications apportées?",
/* it */ "Vuoi salvare le modifiche apportate?",
/* ja */ "変更内容を保存しますか?",
/* ko */ "변경한 내용을 저장하시겠습니까?",
/* pt_br */ "Deseja salvar as alterações feitas?",
/* ru */ "Вы хотите сохранить внесённые изменения?",
/* zh_hans */ "您要保存所做的更改吗?",
/* zh_hant */ "您要保存所做的變更嗎?",
],
// UnsavedChangesDialogYes
[
/* en */ "Save",
/* de */ "Speichern",
/* es */ "Guardar",
/* fr */ "Enregistrer",
/* it */ "Salva",
/* ja */ "保存",
/* ko */ "저장",
/* pt_br */ "Salvar",
/* ru */ "Сохранить",
/* zh_hans */ "保存",
/* zh_hant */ "儲存",
],
// UnsavedChangesDialogNo
[
/* en */ "Don't Save",
/* de */ "Nicht speichern",
/* es */ "No guardar",
/* fr */ "Ne pas enregistrer",
/* it */ "Non salvare",
/* ja */ "保存しない",
/* ko */ "저장 안 함",
/* pt_br */ "Não salvar",
/* ru */ "Не сохранять",
/* zh_hans */ "不保存",
/* zh_hant */ "不儲存",
],
// UnsavedChangesDialogCancel
[
/* en */ "Cancel",
/* de */ "Abbrechen",
/* es */ "Cancelar",
/* fr */ "Annuler",
/* it */ "Annulla",
/* ja */ "キャンセル",
/* ko */ "취소",
/* pt_br */ "Cancelar",
/* ru */ "Отмена",
/* zh_hans */ "取消",
/* zh_hant */ "取消",
],
// AboutDialogTitle
[
/* en */ "About",
/* de */ "Über",
/* es */ "Acerca de",
/* fr */ "À propos",
/* it */ "Informazioni",
/* ja */ "情報",
/* ko */ "정보",
/* pt_br */ "Sobre",
/* ru */ "О программе",
/* zh_hans */ "关于",
/* zh_hant */ "關於",
],
// AboutDialogDescription
[
/* en */ "Grug's favorite editor",
/* de */ "Grugs Lieblingseditor",
/* es */ "El editor favorito de Grug",
/* fr */ "L'éditeur préféré de Grug",
/* it */ "L'editor preferito di Grug",
/* ja */ "Grugのお気に入りエディタ",
/* ko */ "Grug이 가장 좋아하는 편집기",
/* pt_br */ "O editor favorito do Grug",
/* ru */ "Любимый редактор Груга",
/* zh_hans */ "Grug最喜欢的编辑器",
/* zh_hant */ "Grug最喜歡的編輯器",
],
// AboutDialogVersion
[
/* en */ "Version: ",
/* de */ "Version: ",
/* es */ "Versión: ",
/* fr */ "Version : ",
/* it */ "Versione: ",
/* ja */ "バージョン: ",
/* ko */ "버전: ",
/* pt_br */ "Versão: ",
/* ru */ "Версия: ",
/* zh_hans */ "版本:",
/* zh_hant */ "版本:",
],
// SearchLabel
[
/* en */ "Find:",
/* de */ "Suchen:",
/* es */ "Buscar:",
/* fr */ "Rechercher:",
/* it */ "Trova:",
/* ja */ "検索:",
/* ko */ "찾기:",
/* pt_br */ "Encontrar:",
/* ru */ "Найти:",
/* zh_hans */ "查找:",
/* zh_hant */ "尋找:",
],
// SearchClose
[
/* en */ "Close",
/* de */ "Schließen",
/* es */ "Cerrar",
/* fr */ "Fermer",
/* it */ "Chiudi",
/* ja */ "閉じる",
/* ko */ "닫기",
/* pt_br */ "Fechar",
/* ru */ "Закрыть",
/* zh_hans */ "关闭",
/* zh_hant */ "關閉",
],
// SearchMatchCase
[
/* en */ "Match Case",
/* de */ "Groß/Klein",
/* es */ "May/Min",
/* fr */ "Casse",
/* it */ "Maius/minus",
/* ja */ "大/小文字",
/* ko */ "대소문자",
/* pt_br */ "Maius/minus",
/* ru */ "Регистр",
/* zh_hans */ "区分大小写",
/* zh_hant */ "區分大小寫",
],
// SearchWholeWord
[
/* en */ "Whole Word",
/* de */ "Ganzes Wort",
/* es */ "Palabra",
/* fr */ "Mot entier",
/* it */ "Parola",
/* ja */ "単語単位",
/* ko */ "전체 단어",
/* pt_br */ "Palavra",
/* ru */ "Слово",
/* zh_hans */ "全字匹配",
/* zh_hant */ "全字匹配",
],
// SearchUseRegex
[
/* en */ "Use Regex",
/* de */ "RegEx",
/* es */ "RegEx",
/* fr */ "RegEx",
/* it */ "RegEx",
/* ja */ "正規表現",
/* ko */ "정규식",
/* pt_br */ "RegEx",
/* ru */ "RegEx",
/* zh_hans */ "正则",
/* zh_hant */ "正則",
],
// EncodingReopen
[
/* en */ "Reopen with encoding",
/* de */ "Mit Kodierung erneut öffnen",
/* es */ "Reabrir con codificación",
/* fr */ "Rouvrir avec un encodage différent",
/* it */ "Riapri con codifica",
/* ja */ "エンコーディングで再度開く",
/* ko */ "인코딩으로 다시 열기",
/* pt_br */ "Reabrir com codificação",
/* ru */ "Открыть снова с кодировкой",
/* zh_hans */ "使用编码重新打开",
/* zh_hant */ "使用編碼重新打開",
],
// EncodingConvert
[
/* en */ "Convert to encoding",
/* de */ "In Kodierung konvertieren",
/* es */ "Convertir a otra codificación",
/* fr */ "Convertir en encodage",
/* it */ "Converti in codifica",
/* ja */ "エンコーディングに変換",
/* ko */ "인코딩으로 변환",
/* pt_br */ "Converter para codificação",
/* ru */ "Преобразовать в кодировку",
/* zh_hans */ "转换为编码",
/* zh_hant */ "轉換為編碼",
],
// IndentationTabs
[
/* en */ "Tabs",
/* de */ "Tabs",
/* es */ "Tabulaciones",
/* fr */ "Tabulations",
/* it */ "Tabulazioni",
/* ja */ "タブ",
// Was an empty string, which rendered a blank label for Korean users.
/* ko */ "탭",
/* pt_br */ "Tabulações",
/* ru */ "Табы",
/* zh_hans */ "制表符",
/* zh_hant */ "製表符",
],
// IndentationSpaces
[
/* en */ "Spaces",
/* de */ "Leerzeichen",
/* es */ "Espacios",
/* fr */ "Espaces",
/* it */ "Spazi",
/* ja */ "スペース",
/* ko */ "공백",
/* pt_br */ "Espaços",
/* ru */ "Пробелы",
/* zh_hans */ "空格",
/* zh_hant */ "空格",
],
// SaveAsDialogTitle
// NOTE: Exact same translation as FileSaveAs, and both should be kept in sync.
[
/* en */ "Save As…",
/* de */ "Speichern unter…",
/* es */ "Guardar como…",
/* fr */ "Enregistrer sous…",
/* it */ "Salva come…",
/* ja */ "名前を付けて保存…",
/* ko */ "다른 이름으로 저장…",
/* pt_br */ "Salvar como…",
/* ru */ "Сохранить как…",
/* zh_hans */ "另存为…",
/* zh_hant */ "另存新檔…",
],
// SaveAsDialogFilenameLabel
[
/* en */ "Filename:",
/* de */ "Dateiname:",
/* es */ "Nombre de archivo:",
/* fr */ "Nom de fichier :",
/* it */ "Nome del file:",
/* ja */ "ファイル名:",
/* ko */ "파일 이름:",
/* pt_br */ "Nome do arquivo:",
/* ru */ "Имя файла:",
/* zh_hans */ "文件名:",
/* zh_hant */ "檔案名稱:",
],
];
// The currently active UI language. Defaults to English, is overwritten by `init()`
// and is read by `loc()` to pick the column in `S_LANG_LUT`.
static mut S_LANG: LangId = LangId::en;
/// Selects the UI language from the user's preferred languages.
///
/// The languages reported by `sys::preferred_languages()` are tried in order and the
/// first one we have translations for wins. If none matches, English is used.
///
/// The platform layers report region-qualified names such as `de-de` (BCP 47 style)
/// or `de_DE.UTF-8` (POSIX locale style). Each entry is therefore normalized and then
/// matched from most to least specific, e.g. `zh-hant-tw` -> `zh-hant`, `de-de` -> `de`.
pub fn init() {
    let mut lang = LangId::en;

    'langs: for l in sys::preferred_languages() {
        // Normalize to a lowercase, dash-separated tag and strip any POSIX
        // codeset/modifier suffix (".UTF-8", "@euro").
        let mut tag = l.to_ascii_lowercase().replace('_', "-");
        if let Some(end) = tag.find(|c: char| c == '.' || c == '@') {
            tag.truncate(end);
        }

        // Try the full tag first, then keep dropping the last subtag.
        loop {
            let found = match tag.as_str() {
                "en" => Some(LangId::en),
                "de" => Some(LangId::de),
                "es" => Some(LangId::es),
                "fr" => Some(LangId::fr),
                "it" => Some(LangId::it),
                "ja" => Some(LangId::ja),
                "ko" => Some(LangId::ko),
                "pt-br" => Some(LangId::pt_br),
                "ru" => Some(LangId::ru),
                // Regions that use Traditional Chinese are usually reported without a script subtag.
                "zh-hant" | "zh-tw" | "zh-hk" | "zh-mo" => Some(LangId::zh_hant),
                "zh" => Some(LangId::zh_hans),
                _ => None,
            };

            if let Some(found) = found {
                lang = found;
                break 'langs;
            }

            match tag.rfind('-') {
                Some(idx) => tag.truncate(idx),
                // Nothing left to strip: move on to the next preferred language.
                None => break,
            }
        }
    }

    unsafe {
        S_LANG = lang;
    }
}
/// Looks up the translation of `id` for the language chosen by `init()`.
pub fn loc(id: LocId) -> &'static str {
    // One row per string ID, one column per language.
    let translations = &S_LANG_LUT[id as usize];
    let lang_idx = unsafe { S_LANG as usize };
    translations[lang_idx]
}

1067
src/main.rs Normal file

File diff suppressed because it is too large Load diff

491
src/memchr.rs Normal file
View file

@ -0,0 +1,491 @@
//! Rust has a very popular `memchr` crate. It's quite fast, so you may ask yourself
//! why we don't just use it: Simply put, this is optimized for short inputs.
use std::ptr::null;
/// Like `memchr()`, but looks for two needles at once.
///
/// The search starts at index `offset` (clamped to `haystack.len()`) and yields the
/// index of the first byte equal to `needle1` or `needle2`.
/// When neither needle occurs, `haystack.len()` is returned.
pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8], offset: usize) -> usize {
    let len = haystack.len();
    let start = offset.min(len);

    unsafe {
        let base = haystack.as_ptr();
        let hit = memchr2_raw(needle1, needle2, base.add(start), base.add(len));
        distance(hit, base)
    }
}
// In order to make `memchr2_raw` slim and fast, we use a function pointer that updates
// itself to the correct implementation on the first call. This reduces binary size.
// It would also reduce branches if we had >2 implementations (a jump still needs to be predicted).
// NOTE that this ONLY works if Control Flow Guard is disabled on Windows.
//
// The pointer initially refers to `memchr2_dispatch`, which overwrites it
// with the implementation it selected.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
static mut MEMCHR2_DISPATCH: unsafe fn(
needle1: u8,
needle2: u8,
beg: *const u8,
end: *const u8,
) -> *const u8 = memchr2_dispatch;
// First-call resolver for `MEMCHR2_DISPATCH`: picks `memchr2_avx2` if AVX2 is detected
// at runtime and `memchr2_fallback` otherwise, stores the choice in `MEMCHR2_DISPATCH`
// and then forwards the current call to it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn memchr2_dispatch(needle1: u8, needle2: u8, beg: *const u8, end: *const u8) -> *const u8 {
let func = if is_x86_feature_detected!("avx2") {
memchr2_avx2
} else {
memchr2_fallback
};
// Subsequent calls through `MEMCHR2_DISPATCH` skip the feature detection.
unsafe { MEMCHR2_DISPATCH = func };
unsafe { func(needle1, needle2, beg, end) }
}
// Pointer-based core of `memchr2`: searches `[beg, end)` and returns a pointer to the
// first matching byte, or `end` if there is none.
// x86/x86_64 goes through `MEMCHR2_DISPATCH`, aarch64 uses NEON directly,
// and every other architecture uses the scalar fallback.
unsafe fn memchr2_raw(needle1: u8, needle2: u8, beg: *const u8, end: *const u8) -> *const u8 {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
return unsafe { MEMCHR2_DISPATCH(needle1, needle2, beg, end) };
#[cfg(target_arch = "aarch64")]
return unsafe { memchr2_neon(needle1, needle2, beg, end) };
// Unreachable on the architectures handled above, hence the `allow`.
#[allow(unreachable_code)]
return unsafe { memchr2_fallback(needle1, needle2, beg, end) };
}
/// Scalar byte-by-byte forward search over `[beg, end)`.
///
/// Returns a pointer to the first byte equal to `needle1` or `needle2`,
/// or `end` if no such byte exists.
unsafe fn memchr2_fallback(
    needle1: u8,
    needle2: u8,
    mut beg: *const u8,
    end: *const u8,
) -> *const u8 {
    unsafe {
        loop {
            if beg == end {
                return end;
            }
            let byte = *beg;
            if byte == needle1 || byte == needle2 {
                return beg;
            }
            beg = beg.add(1);
        }
    }
}
// FWIW, I found that adding support for AVX512 was not useful at the time,
// as it only marginally improved file load performance by <5%.
//
// AVX2 forward search over `[beg, end)`: compares 32 bytes per iteration against both
// needles and hands the remaining <32 bytes to `memchr2_fallback`.
// Returns a pointer to the first match, or `end` if there is none.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn memchr2_avx2(needle1: u8, needle2: u8, mut beg: *const u8, end: *const u8) -> *const u8 {
    unsafe {
        // This function is also compiled for 32-bit x86, where `std::arch::x86_64`
        // does not exist. Import the module that matches the target.
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        let n1 = _mm256_set1_epi8(needle1 as i8);
        let n2 = _mm256_set1_epi8(needle2 as i8);
        let mut remaining = distance(end, beg);

        while remaining >= 32 {
            let v = _mm256_loadu_si256(beg as *const _);
            let a = _mm256_cmpeq_epi8(v, n1);
            let b = _mm256_cmpeq_epi8(v, n2);
            let c = _mm256_or_si256(a, b);
            // One bit per byte lane; the lowest set bit is the first match.
            let m = _mm256_movemask_epi8(c) as u32;

            if m != 0 {
                return beg.add(m.trailing_zeros() as usize);
            }

            beg = beg.add(32);
            remaining -= 32;
        }

        memchr2_fallback(needle1, needle2, beg, end)

        // TODO: This code probably works correctly but requires more testing.
        /*
        // Handle the remaining <32 bytes by reading 32 bytes and masking out the irrelevant data.
        // This works, because x86 does not care about slice boundaries. It does care about page boundaries, however.
        if remaining > 0 {
            // Data beyond the beg/end range may not be mapped in. As such, we need to avoid reading beyond the
            // page boundaries. This assumes 4KiB pages or larger. If we're in the lower half of the 4KiB page,
            // we load data from `end.sub(off) == end.sub(remaining) == beg`, since we know that this 32-byte read
            // can't possibly read 2KiB. Otherwise, we load from `end.sub(off) == end.sub(32)`, which essentially
            // means we read such that the end of the read is aligned with the end of the haystack. The start of the
            // SIMD register will then contain garbage we must ignore.
            let off = if ((beg as usize) & 2048) != 0 {
                32
            } else {
                remaining
            };
            let v = _mm256_loadu_si256(end.sub(off) as *const _);
            let a = _mm256_cmpeq_epi8(v, n1);
            let b = _mm256_cmpeq_epi8(v, n2);
            let c = _mm256_or_si256(a, b);
            let m = _mm256_movemask_epi8(c) as u32;
            // If we were in the upper half of the 4KiB page, we must shift the mask such that it's not aligned with
            // the end of the haystack but rather with the current `beg`: A shift of `32 - remaining` is needed,
            // which equals `off - remaining`. Otherwise, we must not shift at all. Luckily `off` will be `remaining`
            // in that case and `remaining - remaining` is 0.
            let m = m >> (off - remaining);
            // If we were in the lower half of the 4KiB page, we must mask out anything beyond the end of
            // the haystack. Here, we basically restrict the "length" of `m` to contain `remaining`-many bits.
            // In case of a read in the upper half this won't do anything, but that's fine. Branchless code is great.
            let m = m & ((1 << remaining) - 1);
            if m != 0 {
                return beg.add(m.trailing_zeros() as usize);
            }
        }
        end
        */
    }
}
// NEON forward search over `[beg, end)`: compares 16 bytes per iteration against both
// needles and hands the remaining <16 bytes to `memchr2_fallback`.
#[cfg(target_arch = "aarch64")]
unsafe fn memchr2_neon(needle1: u8, needle2: u8, mut beg: *const u8, end: *const u8) -> *const u8 {
unsafe {
use std::arch::aarch64::*;
if distance(end, beg) >= 16 {
let n1 = vdupq_n_u8(needle1);
let n2 = vdupq_n_u8(needle2);
loop {
let v = vld1q_u8(beg as *const _);
let a = vceqq_u8(v, n1);
let b = vceqq_u8(v, n2);
let c = vorrq_u8(a, b);
// https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
// The narrowing shift condenses the 16 comparison bytes into a 64-bit mask
// with 4 bits per input byte.
let m = vreinterpretq_u16_u8(c);
let m = vshrn_n_u16(m, 4);
let m = vreinterpret_u64_u8(m);
let m = vget_lane_u64(m, 0);
if m != 0 {
// 4 mask bits per byte, so the bit index divided by 4 is the byte index.
return beg.add(m.trailing_zeros() as usize >> 2);
}
beg = beg.add(16);
if distance(end, beg) < 16 {
break;
}
}
}
memchr2_fallback(needle1, needle2, beg, end)
}
}
/// Same as `memchr2`, but searches backwards.
///
/// Only `haystack[..offset]` is searched (`offset` is clamped to `haystack.len()`),
/// starting at the end of that range.
///
/// Returns `Some(index)` with the index of the last byte in that range that equals
/// `needle1` or `needle2`, or `None` if neither needle occurs.
/// The byte at `offset` itself is never considered, so the result is always `< offset`.
pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8], offset: usize) -> Option<usize> {
    unsafe {
        let beg = haystack.as_ptr();
        let end = beg.add(offset.min(haystack.len()));
        // The raw search signals "not found" with a null pointer.
        let hit = memrchr2_raw(needle1, needle2, beg, end);
        if hit.is_null() {
            None
        } else {
            Some(distance(hit, beg))
        }
    }
}
// Self-updating function pointer for `memrchr2_raw`. It initially refers to
// `memrchr2_dispatch`, which overwrites it with the implementation it selected.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
static mut MEMRCHR2_DISPATCH: unsafe fn(
needle1: u8,
needle2: u8,
beg: *const u8,
end: *const u8,
) -> *const u8 = memrchr2_dispatch;
// First-call resolver for `MEMRCHR2_DISPATCH`: picks `memrchr2_avx2` if AVX2 is detected
// at runtime and `memrchr2_fallback` otherwise, stores the choice in `MEMRCHR2_DISPATCH`
// and then forwards the current call to it.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn memrchr2_dispatch(needle1: u8, needle2: u8, beg: *const u8, end: *const u8) -> *const u8 {
let func = if is_x86_feature_detected!("avx2") {
memrchr2_avx2
} else {
memrchr2_fallback
};
// Subsequent calls through `MEMRCHR2_DISPATCH` skip the feature detection.
unsafe { MEMRCHR2_DISPATCH = func };
unsafe { func(needle1, needle2, beg, end) }
}
// Pointer-based core of `memrchr2`: searches `[beg, end)` backwards, starting at `end`.
// x86/x86_64 goes through `MEMRCHR2_DISPATCH`, aarch64 uses NEON directly,
// and every other architecture uses the scalar fallback.
unsafe fn memrchr2_raw(needle1: u8, needle2: u8, beg: *const u8, end: *const u8) -> *const u8 {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
return unsafe { MEMRCHR2_DISPATCH(needle1, needle2, beg, end) };
#[cfg(target_arch = "aarch64")]
return unsafe { memrchr2_neon(needle1, needle2, beg, end) };
// Unreachable on the architectures handled above, hence the `allow`.
#[allow(unreachable_code)]
return unsafe { memrchr2_fallback(needle1, needle2, beg, end) };
}
/// Scalar byte-by-byte backward search over `[beg, end)`.
///
/// Returns a pointer to the last byte equal to `needle1` or `needle2`,
/// or a null pointer if no such byte exists.
unsafe fn memrchr2_fallback(
    needle1: u8,
    needle2: u8,
    beg: *const u8,
    mut end: *const u8,
) -> *const u8 {
    unsafe {
        loop {
            if end == beg {
                return null();
            }
            // `end` is exclusive, so step back before looking at the byte.
            end = end.sub(1);
            let byte = *end;
            if byte == needle1 || byte == needle2 {
                return end;
            }
        }
    }
}
// AVX2 backward search over `[beg, end)`: compares 32 bytes per iteration against both
// needles, walking from `end` towards `beg`, and hands the remaining <32 bytes to
// `memrchr2_fallback`. Returns a pointer to the last match, or whatever the fallback
// returns for the remainder (a null pointer if there is no match).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn memrchr2_avx2(needle1: u8, needle2: u8, beg: *const u8, mut end: *const u8) -> *const u8 {
    unsafe {
        // This function is also compiled for 32-bit x86, where `std::arch::x86_64`
        // does not exist. Import the module that matches the target.
        #[cfg(target_arch = "x86")]
        use std::arch::x86::*;
        #[cfg(target_arch = "x86_64")]
        use std::arch::x86_64::*;

        if distance(end, beg) >= 32 {
            let n1 = _mm256_set1_epi8(needle1 as i8);
            let n2 = _mm256_set1_epi8(needle2 as i8);

            loop {
                end = end.sub(32);

                let v = _mm256_loadu_si256(end as *const _);
                let a = _mm256_cmpeq_epi8(v, n1);
                let b = _mm256_cmpeq_epi8(v, n2);
                let c = _mm256_or_si256(a, b);
                // One bit per byte lane; the highest set bit is the last match.
                let m = _mm256_movemask_epi8(c) as u32;

                if m != 0 {
                    return end.add(31 - m.leading_zeros() as usize);
                }

                if distance(end, beg) < 32 {
                    break;
                }
            }
        }

        memrchr2_fallback(needle1, needle2, beg, end)
    }
}
// NEON backward search over `[beg, end)`: compares 16 bytes per iteration against both
// needles, walking from `end` towards `beg`, and hands the remaining <16 bytes to
// `memrchr2_fallback`.
#[cfg(target_arch = "aarch64")]
unsafe fn memrchr2_neon(needle1: u8, needle2: u8, beg: *const u8, mut end: *const u8) -> *const u8 {
unsafe {
use std::arch::aarch64::*;
if distance(end, beg) >= 16 {
let n1 = vdupq_n_u8(needle1);
let n2 = vdupq_n_u8(needle2);
loop {
end = end.sub(16);
let v = vld1q_u8(end as *const _);
let a = vceqq_u8(v, n1);
let b = vceqq_u8(v, n2);
let c = vorrq_u8(a, b);
// https://community.arm.com/arm-community-blogs/b/servers-and-cloud-computing-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
// The narrowing shift condenses the 16 comparison bytes into a 64-bit mask
// with 4 bits per input byte.
let m = vreinterpretq_u16_u8(c);
let m = vshrn_n_u16(m, 4);
let m = vreinterpret_u64_u8(m);
let m = vget_lane_u64(m, 0);
if m != 0 {
// 4 mask bits per byte; the highest set bit belongs to the last matching byte.
return end.add(15 - (m.leading_zeros() as usize >> 2));
}
if distance(end, beg) < 16 {
break;
}
}
}
memrchr2_fallback(needle1, needle2, beg, end)
}
}
/*pub struct Memchr2<'a> {
needle1: u8,
needle2: u8,
beg: *const u8,
end: *const u8,
it: *const u8,
_marker: PhantomData<&'a [u8]>,
}
impl<'a> Memchr2<'a> {
pub fn new(needle1: u8, needle2: u8, haystack: &'a [u8]) -> Self {
Self {
needle1,
needle2,
beg: haystack.as_ptr(),
end: unsafe { haystack.as_ptr().add(haystack.len()) },
it: haystack.as_ptr(),
_marker: PhantomData,
}
}
}
impl Iterator for Memchr2<'_> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
if self.it.is_null() {
return None;
}
self.it = unsafe { memchr2_raw(self.needle1, self.needle2, self.it, self.end) };
if self.it.is_null() {
return None;
}
let idx = unsafe { distance(self.it, self.beg) };
self.it = if self.it == self.end {
null()
} else {
unsafe { self.it.add(1) }
};
Some(idx)
}
}
impl FusedIterator for Memchr2<'_> {}
pub struct memrchr2<'a> {
needle1: u8,
needle2: u8,
beg: *const u8,
it: *const u8,
_marker: PhantomData<&'a [u8]>,
}
impl<'a> memrchr2<'a> {
pub fn new(needle1: u8, needle2: u8, haystack: &'a [u8]) -> Self {
Self {
needle1,
needle2,
beg: haystack.as_ptr(),
it: unsafe { haystack.as_ptr().add(haystack.len()) },
_marker: PhantomData,
}
}
}
impl Iterator for memrchr2<'_> {
type Item = usize;
fn next(&mut self) -> Option<Self::Item> {
if self.it.is_null() {
return None;
}
self.it = unsafe { memrchr2_raw(self.needle1, self.needle2, self.beg, self.it) };
if self.it.is_null() {
return None;
}
let idx = unsafe { distance(self.it, self.beg) };
self.it = if self.it == self.beg {
null()
} else {
unsafe { self.it.sub(1) }
};
Some(idx)
}
}
impl FusedIterator for memrchr2<'_> {}*/
// Number of `T` elements between `lo` and `hi`. The caller guarantees `hi >= lo`.
// Can be replaced with `sub_ptr` once it's stabilized.
#[inline(always)]
unsafe fn distance<T>(hi: *const T, lo: *const T) -> usize {
    let delta = unsafe { hi.offset_from(lo) };
    // `delta` is non-negative per the contract above, so the conversion cannot fail.
    unsafe { usize::try_from(delta).unwrap_unchecked() }
}
// Unit tests for `memchr2` and `memrchr2`.
#[cfg(test)]
mod tests {
use super::*;
use crate::sys;
use std::slice;
// An empty haystack yields `haystack.len()`, i.e. 0.
#[test]
fn test_memchr2_empty() {
assert_eq!(memchr2(b'a', b'b', b"", 0), 0);
}
// An empty haystack yields `None` for the reverse search.
#[test]
fn test_empty() {
assert_eq!(memrchr2(b'a', b'b', b"", 0), None);
}
#[test]
fn test_basic() {
let haystack = b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
// Only the first 43 bytes are searched: 'a'..='z' followed by 'A'..='Q'.
let haystack = &haystack[..43];
assert_eq!(memchr2(b'a', b'z', haystack, 0), 0);
assert_eq!(memchr2(b'p', b'q', haystack, 0), 15);
assert_eq!(memchr2(b'Q', b'Z', haystack, 0), 42);
assert_eq!(memchr2(b'0', b'9', haystack, 0), haystack.len());
assert_eq!(memrchr2(b'Q', b'P', haystack, 43), Some(42));
assert_eq!(memrchr2(b'p', b'o', haystack, 43), Some(15));
assert_eq!(memrchr2(b'a', b'b', haystack, 43), Some(1));
assert_eq!(memrchr2(b'0', b'9', haystack, 43), None);
}
// Test that it doesn't match before/after the start offset respectively.
#[test]
fn test_with_offset() {
let haystack = b"abcdefghabcdefghabcdefghabcdefghabcdefgh";
assert_eq!(memrchr2(b'h', b'g', haystack, 40), Some(39));
assert_eq!(memrchr2(b'h', b'g', haystack, 39), Some(38));
assert_eq!(memrchr2(b'a', b'b', haystack, 9), Some(8));
assert_eq!(memrchr2(b'a', b'b', haystack, 1), Some(0));
assert_eq!(memrchr2(b'a', b'b', haystack, 0), None);
assert_eq!(memchr2(b'a', b'b', haystack, 0), 0);
assert_eq!(memchr2(b'a', b'b', haystack, 1), 1);
assert_eq!(memchr2(b'a', b'b', haystack, 2), 8);
assert_eq!(memchr2(b'a', b'b', haystack, 9), 9);
assert_eq!(memchr2(b'a', b'b', haystack, 16), 16);
// An offset past the end is clamped to `haystack.len()`.
assert_eq!(memchr2(b'a', b'b', haystack, 41), 40);
}
// Test memory access safety at page boundaries.
// The test is a success if it doesn't segfault.
// NOTE(review): the reserved region is never passed to `sys::virtual_release`, and the
// test hard-codes a 4096-byte page size - confirm both are acceptable on all targets.
#[test]
fn test_page_boundary() {
let page = unsafe {
let page_size = 4096;
// 3 pages: uncommitted, committed, uncommitted
let ptr = sys::virtual_reserve(page_size * 3).unwrap() as *mut u8;
sys::virtual_commit(ptr.add(page_size), page_size).unwrap();
slice::from_raw_parts_mut(ptr.add(page_size), page_size)
};
page.fill(b'a');
// Test if it seeks beyond the page boundary.
assert_eq!(memchr2(b'\0', b'\0', &page[page.len() - 40..], 0), 40);
// Test if it seeks before the page boundary for the masked/partial load.
assert_eq!(memchr2(b'\0', b'\0', &page[..10], 0), 10);
// Same as above, but for memrchr2 (hence reversed).
assert_eq!(memrchr2(b'\0', b'\0', &page[page.len() - 10..], 10), None);
assert_eq!(memrchr2(b'\0', b'\0', &page[..40], 40), None);
}
}

10
src/sys.rs Normal file
View file

@ -0,0 +1,10 @@
#[cfg(unix)]
mod unix;
#[cfg(windows)]
#[macro_use]
mod windows;
#[cfg(unix)]
pub use unix::*;
#[cfg(windows)]
pub use windows::*;

353
src/sys/unix.rs Normal file
View file

@ -0,0 +1,353 @@
use crate::apperr;
use std::ffi::{CStr, c_int, c_void};
use std::fs::File;
use std::io::{ErrorKind, Read, Write};
use std::mem::{self, ManuallyDrop, MaybeUninit};
use std::os::fd::FromRawFd;
use std::ptr::{null, null_mut};
use std::thread;
use std::time;
/// Collects the user's preferred locales from the environment.
///
/// `LANGUAGE`, `LC_ALL` and `LANG` are consulted in that order. Each value is split
/// on `:` and empty segments are dropped. The entries are returned verbatim,
/// e.g. `en_US.UTF-8`.
pub fn preferred_languages() -> Vec<String> {
    ["LANGUAGE", "LC_ALL", "LANG"]
        .into_iter()
        .filter_map(|key| std::env::var(key).ok())
        .flat_map(|value| {
            value
                .split(':')
                .filter(|segment| !segment.is_empty())
                .map(String::from)
                .collect::<Vec<_>>()
        })
        .collect()
}
// Signal handler installed for SIGWINCH by `init()`. It only sets a flag;
// `read_stdin()` checks the flag and reports the new window size.
extern "C" fn sigwinch_handler(_: libc::c_int) {
unsafe {
STATE.inject_resize = true;
}
}
/// Prepares the terminal for interactive use.
///
/// Opens `/dev/tty` for stdin/stdout if they are not terminals, saves the current
/// termios settings (restored by `deinit()`), turns off `ICANON` and `ECHO`,
/// and installs the SIGWINCH handler.
///
/// Returns an error if any of the underlying libc calls fails.
pub fn init() -> apperr::Result<()> {
unsafe {
// Reopen stdin/stdout if they're redirected.
if libc::isatty(STATE.stdin) == 0 {
STATE.stdin = check_int_return(libc::open(c"/dev/tty".as_ptr(), libc::O_RDONLY))?;
}
if libc::isatty(STATE.stdout) == 0 {
STATE.stdout = check_int_return(libc::open(c"/dev/tty".as_ptr(), libc::O_WRONLY))?;
}
// Remember the original settings so that `deinit()` can restore them.
check_int_return(libc::tcgetattr(
STATE.stdout,
&raw mut STATE.stdout_initial_termios,
))?;
// Work on a copy: disable line buffering (ICANON) and input echo (ECHO).
let mut termios = STATE.stdout_initial_termios;
termios.c_lflag &= !(libc::ICANON | libc::ECHO);
check_int_return(libc::tcsetattr(STATE.stdout, libc::TCSANOW, &termios))?;
// Set STATE.inject_resize to true whenever we get a SIGWINCH.
let mut sigwinch_action: libc::sigaction = mem::zeroed();
sigwinch_action.sa_sigaction = sigwinch_handler as libc::sighandler_t;
check_int_return(libc::sigaction(
libc::SIGWINCH,
&sigwinch_action,
null_mut(),
))?;
Ok(())
}
}
/// Restores the termios settings that `init()` saved.
/// The result of `tcsetattr` is ignored.
pub fn deinit() {
unsafe {
libc::tcsetattr(
STATE.stdout,
libc::TCSANOW,
&raw mut STATE.stdout_initial_termios,
);
}
}
/// Requests that the next `read_stdin()` call reports the current window size,
/// exactly as if a SIGWINCH had been received.
pub fn inject_window_size_into_stdin() {
unsafe {
STATE.inject_resize = true;
}
}
/// Queries the terminal size via `TIOCGWINSZ` and returns it as `(columns, rows)`.
///
/// A zero-sized result is retried with a growing delay (10ms, 20ms, ...).
/// After the 10th unsuccessful attempt 80x24 is assumed.
fn get_window_size() -> (u16, u16) {
    const MAX_ATTEMPTS: u64 = 10;

    let mut attempt: u64 = 1;
    loop {
        let winsz = unsafe {
            let mut winsz: libc::winsize = mem::zeroed();
            libc::ioctl(STATE.stdout, libc::TIOCGWINSZ, &raw mut winsz);
            winsz
        };

        if winsz.ws_col != 0 && winsz.ws_row != 0 {
            return (winsz.ws_col, winsz.ws_row);
        }
        if attempt == MAX_ATTEMPTS {
            return (80, 24);
        }

        // Some terminals are bad emulators and don't report TIOCGWINSZ immediately.
        thread::sleep(time::Duration::from_millis(10 * attempt));
        attempt += 1;
    }
}
// Process-wide terminal state shared by the functions in this module.
struct State {
// File descriptors used for terminal input/output. `init()` replaces them
// with `/dev/tty` handles if the standard streams are not terminals.
stdin: libc::c_int,
stdout: libc::c_int,
// Terminal settings captured by `init()` and restored by `deinit()`.
stdout_initial_termios: libc::termios,
// Set by the SIGWINCH handler and `inject_window_size_into_stdin()`,
// consumed by `read_stdin()`.
inject_resize: bool,
// Buffer for incomplete UTF-8 sequences (max 4 bytes needed)
utf8_buf: [u8; 4],
// Number of valid bytes in `utf8_buf`.
utf8_len: usize,
}
// The single `State` instance. Starts out with the standard stream descriptors,
// zeroed termios settings and no pending resize or buffered input.
static mut STATE: State = State {
stdin: libc::STDIN_FILENO,
stdout: libc::STDOUT_FILENO,
stdout_initial_termios: unsafe { mem::zeroed() },
inject_resize: false,
utf8_buf: [0; 4],
utf8_len: 0,
};
/// Reads from stdin.
///
/// Returns `None` if there was an error reading from stdin.
/// Returns `Some("")` if the given timeout was reached.
/// Otherwise, it returns the read, non-empty string.
///
/// A pending resize request (`STATE.inject_resize`) takes precedence over reading:
/// in that case a `CSI 8 ; rows ; cols t` sequence with the current window size is returned.
///
/// A UTF-8 sequence that is cut off at the end of a read is held back in `STATE.utf8_buf`
/// and prepended to the data of the next read. Bytes that are not valid UTF-8 are
/// replaced with U+FFFD.
pub fn read_stdin(timeout: Option<time::Duration>) -> Option<String> {
    unsafe {
        if let Some(timeout) = timeout {
            let mut pollfd = libc::pollfd {
                fd: STATE.stdin,
                events: libc::POLLIN,
                revents: 0,
            };
            let ts = libc::timespec {
                tv_sec: timeout.as_secs() as libc::time_t,
                tv_nsec: timeout.subsec_nanos() as libc::c_long,
            };
            let ret = libc::ppoll(&mut pollfd, 1, &ts, null());
            if ret < 0 {
                // Being interrupted by a signal (e.g. SIGWINCH) is not a stdin failure.
                if *libc::__errno_location() != libc::EINTR {
                    return None;
                }
                // Without a pending resize there's nothing to report and reading
                // could block, so treat the interruption like a timeout.
                if !STATE.inject_resize {
                    return Some(String::new());
                }
            } else if ret == 0 {
                return Some(String::new());
            }
        }

        let mut buf = [MaybeUninit::<u8>::uninit(); 1024];
        let mut input = Vec::new();

        loop {
            if STATE.inject_resize {
                STATE.inject_resize = false;
                let (w, h) = get_window_size();
                return Some(format!("\x1b[8;{};{}t", h, w));
            }

            // Read new data.
            let ret = libc::read(STATE.stdin, buf.as_mut_ptr() as *mut _, buf.len());
            if ret < 0 {
                if *libc::__errno_location() == libc::EINTR {
                    // Go back to the top, so that a resize signaled by SIGWINCH
                    // is delivered instead of blocking in read() again.
                    continue;
                }
                return None;
            }
            if ret == 0 {
                return None;
            }
            // SAFETY: read() initialized the first `ret` bytes of `buf`.
            let chunk = std::slice::from_raw_parts(buf.as_ptr() as *const u8, ret as usize);

            // Prepend any cached incomplete UTF-8 sequence.
            input.clear();
            input.extend_from_slice(&STATE.utf8_buf[..STATE.utf8_len]);
            input.extend_from_slice(chunk);

            // Hold back a truncated trailing sequence (at most 3 bytes) for the next read.
            let valid_end = utf8_complete_prefix_len(&input);
            let tail_len = input.len() - valid_end;
            STATE.utf8_buf[..tail_len].copy_from_slice(&input[valid_end..]);
            STATE.utf8_len = tail_len;

            if valid_end != 0 {
                return Some(String::from_utf8_lossy(&input[..valid_end]).into_owned());
            }
        }
    }
}

/// Returns the length of the longest prefix of `bytes` that doesn't end in a truncated
/// UTF-8 sequence. The remaining tail is at most 3 bytes long.
fn utf8_complete_prefix_len(bytes: &[u8]) -> usize {
    let len = bytes.len();

    // A sequence is at most 4 bytes long, so the lead byte of a truncated one
    // must be among the last 3 bytes.
    for back in 1..=len.min(3) {
        let byte = bytes[len - back];
        if byte & 0xC0 == 0x80 {
            // Continuation byte: keep looking for the lead byte.
            continue;
        }

        let expected = match byte {
            0xC0..=0xDF => 2,
            0xE0..=0xEF => 3,
            0xF0..=0xF7 => 4,
            _ => 1,
        };
        return if expected > back { len - back } else { len };
    }

    // No lead byte in reach: nothing that could be completed by a later read.
    len
}
/// Writes `text` to the terminal in full.
///
/// Short writes are continued and writes interrupted by a signal (`EINTR`) are retried.
/// Any other error silently aborts the write.
pub fn write_stdout(text: &str) {
    let mut pending = text.as_bytes();

    while !pending.is_empty() {
        let ret = unsafe { libc::write(STATE.stdout, pending.as_ptr() as *const _, pending.len()) };

        if ret < 0 {
            if unsafe { *libc::__errno_location() } != libc::EINTR {
                return;
            }
            continue;
        }

        pending = &pending[ret as usize..];
    }
}
/// Returns the process' standard input as a `File` if it is not a terminal
/// (e.g. a pipe or a file), and `None` otherwise.
pub fn open_stdin_if_redirected() -> Option<File> {
    let redirected = unsafe { libc::isatty(libc::STDIN_FILENO) } == 0;
    redirected.then(|| unsafe { File::from_raw_fd(libc::STDIN_FILENO) })
}
pub unsafe fn virtual_reserve(size: usize) -> apperr::Result<*mut u8> {
unsafe {
let ptr = libc::mmap(
null_mut(),
size,
libc::PROT_NONE,
libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
-1,
0,
);
if ptr == libc::MAP_FAILED {
Err(apperr::Error::new(libc::ENOMEM as u32))
} else {
Ok(ptr as *mut u8)
}
}
}
/// Unmaps `size` bytes starting at `base` via `munmap`.
/// The result of `munmap` is ignored.
pub unsafe fn virtual_release(base: *mut u8, size: usize) {
unsafe {
libc::munmap(base as *mut libc::c_void, size);
}
}
/// Makes `size` bytes starting at `base` readable and writable via `mprotect`.
///
/// Any `mprotect` failure is reported as `ENOMEM`.
pub unsafe fn virtual_commit(base: *mut u8, size: usize) -> apperr::Result<()> {
    let prot = libc::PROT_READ | libc::PROT_WRITE;
    let status = unsafe { libc::mprotect(base as *mut libc::c_void, size, prot) };

    if status == 0 {
        Ok(())
    } else {
        Err(apperr::Error::new(libc::ENOMEM as u32))
    }
}
unsafe fn load_library(name: &CStr) -> apperr::Result<*mut c_void> {
unsafe {
let handle = libc::dlopen(name.as_ptr(), libc::RTLD_LAZY);
if handle.is_null() {
Err(apperr::Error::new(libc::ELIBACC as u32))
} else {
Ok(handle)
}
}
}
// It'd be nice to constrain T to std::marker::FnPtr, but that's unstable.
pub unsafe fn get_proc_address<T>(handle: *mut c_void, name: &CStr) -> apperr::Result<T> {
unsafe {
let sym = libc::dlsym(handle, name.as_ptr());
if sym.is_null() {
Err(apperr::Error::new(libc::ELIBACC as u32))
} else {
Ok(mem::transmute_copy(&sym))
}
}
}
/// Loads the ICU library and returns its handle.
// NOTE(review): "icu.dll" is the Windows name of the library. On Unix `dlopen` is
// unlikely to resolve it (ICU usually ships as libicuuc/libicui18n shared objects) -
// confirm the intended library name(s) for this platform.
pub unsafe fn load_icu() -> apperr::Result<*mut c_void> {
unsafe { load_library(c"icu.dll") }
}
/// Converts a `std::io::Error` into an `apperr::Error` carrying its raw OS error code.
/// Errors without an OS error code are mapped to code 0.
// NOTE(review): `errno_to_apperr` clamps its code to >= 1, whereas this may pass 0 -
// confirm that `apperr::Error::new(0)` is valid.
#[inline]
pub fn io_error_to_apperr(err: std::io::Error) -> apperr::Error {
unsafe { apperr::Error::new(err.raw_os_error().unwrap_or(0) as u32) }
}
/// Formats `err` as `Error <errno>`, followed by `: <message>` if `strerror`
/// provides a description. Only the lower 16 bits of the error value are used as errno.
pub fn format_error(err: apperr::Error) -> String {
    let errno = err.value() & 0xFFFF;

    let description = unsafe {
        let ptr = libc::strerror(errno as i32);
        if ptr.is_null() {
            None
        } else {
            Some(CStr::from_ptr(ptr).to_string_lossy().into_owned())
        }
    };

    match description {
        Some(msg) => format!("Error {}: {}", errno, msg),
        None => format!("Error {}", errno),
    }
}
// Wraps an errno value in an `apperr::Error`. Values below 1 are clamped to 1.
fn errno_to_apperr(no: c_int) -> apperr::Error {
unsafe { apperr::Error::new(no.max(1) as u32) }
}
// Turns the result of a libc call into a `Result`: non-negative values are passed
// through as `Ok`, negative values become an error built from the current `errno`.
fn check_int_return(ret: libc::c_int) -> apperr::Result<libc::c_int> {
    if ret >= 0 {
        return Ok(ret);
    }

    let errno = unsafe { *libc::__errno_location() };
    Err(errno_to_apperr(errno))
}

524
src/sys/windows.rs Normal file
View file

@ -0,0 +1,524 @@
use crate::helpers::{CoordType, Size};
use crate::{apperr, helpers};
use std::ffi::CStr;
use std::fmt::Write as _;
use std::fs::File;
use std::mem::MaybeUninit;
use std::os::windows::io::FromRawHandle;
use std::ptr::{null, null_mut};
use std::{mem, time};
use windows_sys::Win32::Foundation;
use windows_sys::Win32::Globalization;
use windows_sys::Win32::Storage::FileSystem;
use windows_sys::Win32::System::Console;
use windows_sys::Win32::System::Diagnostics::Debug;
use windows_sys::Win32::System::IO;
use windows_sys::Win32::System::LibraryLoader;
use windows_sys::Win32::System::Memory;
use windows_sys::Win32::System::Threading;
use windows_sys::w;
/// Returns the user's preferred UI languages as reported by
/// `GetUserPreferredUILanguages`, converted to UTF-8 and ASCII-lowercased.
///
/// Returns an empty list if the API call fails, reports no languages,
/// or the conversion to UTF-8 fails.
pub fn preferred_languages() -> Vec<String> {
unsafe {
const LEN: usize = 256;
let mut lang_num = 0;
let mut lang_buf = [const { MaybeUninit::<u16>::uninit() }; LEN];
// In: capacity of `lang_buf`. Out: number of UTF-16 units written.
let mut lang_buf_len = lang_buf.len() as u32;
if Globalization::GetUserPreferredUILanguages(
Globalization::MUI_LANGUAGE_NAME,
&mut lang_num,
lang_buf[0].as_mut_ptr(),
&mut lang_buf_len,
) == 0
|| lang_num == 0
{
return Vec::new();
}
// Drop the terminating double-null character.
lang_buf_len = lang_buf_len.saturating_sub(1);
// 3 bytes of UTF-8 per UTF-16 code unit are reserved for the conversion.
let mut lang_buf_utf8 = [const { MaybeUninit::<u8>::uninit() }; 3 * LEN];
let lang_buf_utf8_len = Globalization::WideCharToMultiByte(
Globalization::CP_UTF8,
0,
lang_buf[0].as_mut_ptr(),
lang_buf_len as i32,
lang_buf_utf8[0].as_mut_ptr(),
lang_buf_utf8.len() as i32,
null(),
null_mut(),
);
if lang_buf_utf8_len == 0 {
return Vec::new();
}
let result = helpers::str_from_raw_parts_mut(
lang_buf_utf8[0].as_mut_ptr(),
lang_buf_utf8_len as usize,
);
result.make_ascii_lowercase();
// The buffer holds one null-terminated string per language.
result.split_terminator('\0').map(String::from).collect()
}
}
// Signature of kernel32's `ReadConsoleInputExW`, which `init()` resolves at runtime
// via `get_proc_address`.
type ReadConsoleInputExW = unsafe extern "system" fn(
h_console_input: Foundation::HANDLE,
lp_buffer: *mut Console::INPUT_RECORD,
n_length: u32,
lp_number_of_events_read: *mut u32,
w_flags: u16,
) -> Foundation::BOOL;
// Flag value for the `w_flags` parameter of `ReadConsoleInputExW`.
// NOTE(review): presumably makes the call return immediately if no input is pending - confirm.
const CONSOLE_READ_NOWAIT: u16 = 0x0002;
// Process-wide console state shared by the functions in this module.
struct State {
// `ReadConsoleInputExW`, resolved by `init()`. Panics if called before that.
read_console_input_ex: ReadConsoleInputExW,
// Handles to `CONIN$` and `CONOUT$`, opened by `init()`.
stdin: Foundation::HANDLE,
stdout: Foundation::HANDLE,
// Console code pages and modes captured by `init()` and restored by `deinit()`.
stdin_cp_old: u32,
stdout_cp_old: u32,
stdin_mode_old: u32,
stdout_mode_old: u32,
// A UTF-16 code unit carried over from one `read_stdin()` call to the next (0 = none).
leading_surrogate: u16,
// Set by `inject_window_size_into_stdin()`, consumed by `read_stdin()`.
inject_resize: bool,
// Set by the console control handler.
wants_exit: bool,
}
// The single `State` instance. Everything is zeroed/null until `init()` runs;
// `read_console_input_ex` points at a placeholder that panics.
static mut STATE: State = State {
read_console_input_ex: read_console_input_ex_placeholder,
stdin: null_mut(),
stdout: null_mut(),
stdin_cp_old: 0,
stdout_cp_old: 0,
stdin_mode_old: 0,
stdout_mode_old: 0,
leading_surrogate: 0,
inject_resize: false,
wants_exit: false,
};
// Initial value of `STATE.read_console_input_ex`. It only exists so that the static
// can be initialized; calling it (i.e. reading input before `init()`) panics.
unsafe extern "system" fn read_console_input_ex_placeholder(
_: Foundation::HANDLE,
_: *mut Console::INPUT_RECORD,
_: u32,
_: *mut u32,
_: u16,
) -> Foundation::BOOL {
panic!();
}
// Console control handler registered by `init()`. For every control event it sets
// `STATE.wants_exit` and cancels pending I/O on the console input handle.
extern "system" fn console_ctrl_handler(_ctrl_type: u32) -> Foundation::BOOL {
unsafe {
STATE.wants_exit = true;
IO::CancelIoEx(STATE.stdin, null());
}
// TRUE: the event is reported as handled.
1
}
/// Prepares the console for interactive use.
///
/// Resolves `ReadConsoleInputExW`, installs the console control handler, opens
/// `CONIN$`/`CONOUT$`, saves the current code pages and console modes (restored by
/// `deinit()`), and then switches the console to UTF-8 with VT input/output enabled.
///
/// Returns an error if any of the underlying API calls fails.
pub fn init() -> apperr::Result<()> {
unsafe {
let kernel32 = LibraryLoader::GetModuleHandleW(w!("kernel32.dll"));
STATE.read_console_input_ex = get_proc_address(kernel32, c"ReadConsoleInputExW")?;
check_bool_return(Console::SetConsoleCtrlHandler(
Some(console_ctrl_handler),
1,
))?;
// Open the console directly instead of relying on the standard handles.
STATE.stdin = FileSystem::CreateFileW(
w!("CONIN$"),
Foundation::GENERIC_READ | Foundation::GENERIC_WRITE,
FileSystem::FILE_SHARE_READ | FileSystem::FILE_SHARE_WRITE,
null_mut(),
FileSystem::OPEN_EXISTING,
0,
null_mut(),
);
STATE.stdout = FileSystem::CreateFileW(
w!("CONOUT$"),
Foundation::GENERIC_READ | Foundation::GENERIC_WRITE,
FileSystem::FILE_SHARE_READ | FileSystem::FILE_SHARE_WRITE,
null_mut(),
FileSystem::OPEN_EXISTING,
0,
null_mut(),
);
if STATE.stdin == Foundation::INVALID_HANDLE_VALUE
|| STATE.stdout == Foundation::INVALID_HANDLE_VALUE
{
return Err(get_last_error());
}
// Remember the original code pages and modes so that `deinit()` can restore them.
STATE.stdin_cp_old = Console::GetConsoleCP();
STATE.stdout_cp_old = Console::GetConsoleOutputCP();
check_bool_return(Console::GetConsoleMode(
STATE.stdin,
&raw mut STATE.stdin_mode_old,
))?;
check_bool_return(Console::GetConsoleMode(
STATE.stdout,
&raw mut STATE.stdout_mode_old,
))?;
check_bool_return(Console::SetConsoleCP(Globalization::CP_UTF8))?;
check_bool_return(Console::SetConsoleOutputCP(Globalization::CP_UTF8))?;
check_bool_return(Console::SetConsoleMode(
STATE.stdin,
Console::ENABLE_WINDOW_INPUT
| Console::ENABLE_EXTENDED_FLAGS
| Console::ENABLE_VIRTUAL_TERMINAL_INPUT,
))?;
check_bool_return(Console::SetConsoleMode(
STATE.stdout,
Console::ENABLE_PROCESSED_OUTPUT
| Console::ENABLE_WRAP_AT_EOL_OUTPUT
| Console::ENABLE_VIRTUAL_TERMINAL_PROCESSING
| Console::DISABLE_NEWLINE_AUTO_RETURN,
))?;
Ok(())
}
}
/// Restores the console code pages and modes that `init()` saved.
/// The results of the API calls are ignored.
pub fn deinit() {
unsafe {
Console::SetConsoleCP(STATE.stdin_cp_old);
Console::SetConsoleOutputCP(STATE.stdout_cp_old);
Console::SetConsoleMode(STATE.stdin, STATE.stdin_mode_old);
Console::SetConsoleMode(STATE.stdout, STATE.stdout_mode_old);
}
}
/// Requests that the next `read_stdin()` call queries the current console size.
pub fn inject_window_size_into_stdin() {
unsafe {
STATE.inject_resize = true;
}
}
// Returns the size of the visible console window (not of the whole screen buffer)
// in character cells, or `None` if the console could not be queried.
// Both dimensions are at least 1.
fn get_console_size() -> Option<Size> {
    let mut info: Console::CONSOLE_SCREEN_BUFFER_INFOEX = unsafe { mem::zeroed() };
    info.cbSize = mem::size_of::<Console::CONSOLE_SCREEN_BUFFER_INFOEX>() as u32;

    let ok = unsafe { Console::GetConsoleScreenBufferInfoEx(STATE.stdout, &mut info) } != 0;
    if !ok {
        return None;
    }

    // The window rectangle is inclusive on both ends, hence the +1.
    let width = (info.srWindow.Right - info.srWindow.Left + 1).max(1) as CoordType;
    let height = (info.srWindow.Bottom - info.srWindow.Top + 1).max(1) as CoordType;
    Some(Size { width, height })
}
/// Reads from stdin.
///
/// Returns `None` if there was an error reading from stdin.
/// Returns `Some("")` if the given timeout was reached.
/// Otherwise, it returns the read, non-empty string.
///
/// Key-down events carrying a character are collected as UTF-16 and converted
/// to UTF-8. A window resize (either injected via `inject_window_size_into_stdin()`
/// or received as a console event) is reported as a `CSI 8 ; height ; width t`
/// sequence placed in front of the text.
pub fn read_stdin(timeout: Option<time::Duration>) -> Option<String> {
let mut input_buf = [const { MaybeUninit::<Console::INPUT_RECORD>::uninit() }; 1024];
let mut input_buf_cap = input_buf.len();
let mut utf16_buf = [const { MaybeUninit::<u16>::uninit() }; 1024];
let mut utf16_buf_len = 0;
let mut resize_event = None;
let mut read_more = true;
let mut read_poll = timeout.is_some();
// A pending resize injection is answered with the current console size.
if unsafe { STATE.inject_resize } {
resize_event = get_console_size();
read_poll = true;
unsafe { STATE.inject_resize = false };
}
// A lead surrogate left over from the previous call goes first. It occupies one
// slot of `utf16_buf`, so one fewer input record may be read to avoid overflowing it.
if unsafe { STATE.leading_surrogate } != 0 {
utf16_buf[0] = MaybeUninit::new(unsafe { STATE.leading_surrogate });
utf16_buf_len = 1;
input_buf_cap -= 1;
unsafe { STATE.leading_surrogate = 0 };
}
if let Some(timeout) = timeout {
let wait_result =
unsafe { Threading::WaitForSingleObject(STATE.stdin, timeout.as_millis() as u32) };
match wait_result {
// Ready to read? Continue with reading below.
// `read_more` is already true to ensure we don't block.
Foundation::WAIT_OBJECT_0 => {}
// Timeout? Skip reading entirely.
Foundation::WAIT_TIMEOUT => read_more = false,
// Error? Tell the caller stdin is broken.
_ => return None,
}
}
// This loop exists, just in case there's events in the input buffer that we aren't interested in.
// It should be rare for this to loop.
// NOTE(review): with `read_poll` set and only uninteresting events pending, this
// presumably keeps looping without blocking until something relevant arrives - confirm.
while read_more {
let input = unsafe {
// If we had a `inject_resize`, we don't want to block indefinitely for other pending input on startup,
// but are still interested in any other pending input that may be waiting for us.
let flags = if read_poll { CONSOLE_READ_NOWAIT } else { 0 };
let mut read = 0;
let ok = (STATE.read_console_input_ex)(
STATE.stdin,
input_buf[0].as_mut_ptr(),
input_buf_cap as u32,
&mut read,
flags,
);
if ok == 0 || STATE.wants_exit {
return None;
}
// Only the first `read` records are viewed as initialized `INPUT_RECORD`s.
&*(&input_buf[..read as usize] as *const _ as *const [Console::INPUT_RECORD])
};
for inp in input {
match inp.EventType as u32 {
Console::KEY_EVENT => {
let event = unsafe { &inp.Event.KeyEvent };
let ch = unsafe { event.uChar.UnicodeChar };
// Only key-down events that carry a character are of interest.
if event.bKeyDown != 0 && ch != 0 {
utf16_buf[utf16_buf_len] = MaybeUninit::new(ch);
utf16_buf_len += 1;
}
}
Console::WINDOW_BUFFER_SIZE_EVENT => {
let event = unsafe { &inp.Event.WindowBufferSizeEvent };
let w = event.dwSize.X as CoordType;
let h = event.dwSize.Y as CoordType;
// Windows is prone to sending broken/useless `WINDOW_BUFFER_SIZE_EVENT`s.
// E.g. starting conhost will emit 3 in a row. Skip rendering in that case.
if w > 0 && h > 0 {
resize_event = Some(Size {
width: w,
height: h,
});
}
}
_ => {}
}
}
// Stop as soon as there is something to report to the caller.
read_more = !resize_event.is_some() && utf16_buf_len == 0;
}
const RESIZE_EVENT_FMT_MAX_LEN: usize = 16; // "\x1b[8;65535;65535t"
let resize_event_len = if resize_event.is_some() {
RESIZE_EVENT_FMT_MAX_LEN
} else {
0
};
// +1 to account for a potential `STATE.leading_surrogate`.
let utf8_max_len = (utf16_buf_len + 1) * 3;
let mut text = String::with_capacity(utf8_max_len + resize_event_len);
if let Some(resize_event) = resize_event {
// If I read xterm's documentation correctly, CSI 18 t reports the window size in characters.
// CSI 8 ; height ; width t is the response. Of course, we didn't send the request,
// but we can use this fake response to trigger the editor to resize itself.
_ = write!(
text,
"\x1b[8;{};{}t",
resize_event.height, resize_event.width
);
}
// If the input ends with a lone lead surrogate, we need to remember it for the next read.
if utf16_buf_len > 0 {
unsafe {
let last_char = utf16_buf[utf16_buf_len - 1].assume_init();
if 0xD800 <= last_char && last_char <= 0xDBFF {
STATE.leading_surrogate = last_char;
utf16_buf_len -= 1;
}
}
}
// Convert the remaining input to UTF8, the sane encoding.
if utf16_buf_len > 0 {
unsafe {
// Write straight into the string's spare capacity and then extend its
// length by however many bytes the conversion reports.
let vec = text.as_mut_vec();
let spare = vec.spare_capacity_mut();
let len = Globalization::WideCharToMultiByte(
Globalization::CP_UTF8,
0,
utf16_buf[0].as_ptr(),
utf16_buf_len as i32,
spare.as_mut_ptr() as *mut _,
spare.len() as i32,
null(),
null_mut(),
);
if len > 0 {
vec.set_len(vec.len() + len as usize);
}
}
}
Some(text)
}
/// Writes `text` to the console output handle stored in `STATE`.
///
/// The data is handed to `WriteFile` in pieces of at most 1 GiB. Writing stops
/// silently as soon as a call fails or reports that nothing was written.
pub fn write_stdout(text: &str) {
    const MAX_WRITE: usize = 1024 * 1024 * 1024;

    let bytes = text.as_bytes();
    let mut done = 0;

    while done < bytes.len() {
        let pending = &bytes[done..];
        let chunk_len = pending.len().min(MAX_WRITE) as u32;
        let mut written = 0;
        let ok = unsafe {
            FileSystem::WriteFile(
                STATE.stdout,
                pending.as_ptr(),
                chunk_len,
                &mut written,
                null_mut(),
            )
        };

        done += written as usize;
        if ok == 0 || written == 0 {
            break;
        }
    }
}
/// Returns the process' standard input as a `File` if it is a disk file or a pipe.
///
/// Returns `None` for any other handle type (for instance an interactive console).
pub fn open_stdin_if_redirected() -> Option<File> {
    unsafe {
        let handle = Console::GetStdHandle(Console::STD_INPUT_HANDLE);
        let redirected = matches!(
            FileSystem::GetFileType(handle),
            FileSystem::FILE_TYPE_DISK | FileSystem::FILE_TYPE_PIPE
        );
        if redirected {
            Some(File::from_raw_handle(handle))
        } else {
            None
        }
    }
}
/// Reserves (without committing) `size` bytes of virtual address space.
///
/// In debug builds every reservation is requested at an explicit base address
/// that advances by `0x0000100000000000` per call; in release builds the
/// system picks the address.
///
/// Returns the base address of the reservation, or the last OS error if
/// `VirtualAlloc` returned null.
pub unsafe fn virtual_reserve(size: usize) -> apperr::Result<*mut u8> {
    unsafe {
        let base = if cfg!(debug_assertions) {
            static mut S_BASE_GEN: usize = 0x0000100000000000;
            S_BASE_GEN += 0x0000100000000000;
            S_BASE_GEN as *mut _
        } else {
            null_mut()
        };

        let reserved = Memory::VirtualAlloc(
            base,
            size,
            Memory::MEM_RESERVE,
            Memory::PAGE_READWRITE,
        );
        check_ptr_return(reserved as *mut u8)
    }
}
/// Releases an address range that was reserved with `VirtualAlloc(MEM_RESERVE)`.
///
/// `base` must be the base address that the reservation returned.
/// `size` is accepted so callers don't have to change, but it is intentionally
/// not forwarded: `VirtualFree` requires `dwSize` to be 0 when `MEM_RELEASE`
/// is used and fails (releasing nothing) for any other value.
pub unsafe fn virtual_release(base: *mut u8, size: usize) {
    let _ = size;
    unsafe {
        Memory::VirtualFree(base as *mut _, 0, Memory::MEM_RELEASE);
    }
}
/// Commits `size` bytes of read/write memory starting at `base`.
///
/// Returns the last OS error if `VirtualAlloc` returned null.
pub unsafe fn virtual_commit(base: *mut u8, size: usize) -> apperr::Result<()> {
    let committed = unsafe {
        Memory::VirtualAlloc(
            base as *mut _,
            size,
            Memory::MEM_COMMIT,
            Memory::PAGE_READWRITE,
        )
    };
    // Only success/failure matters; the returned address is not needed.
    check_ptr_return(committed)?;
    Ok(())
}
/// Loads the library called `name`, searching only the System32 directory.
///
/// `name` is passed to `LoadLibraryExW` unchanged. Returns the last OS error
/// if the returned module handle is null.
unsafe fn load_library(name: *const u16) -> apperr::Result<Foundation::HMODULE> {
    let module = unsafe {
        LibraryLoader::LoadLibraryExW(
            name,
            null_mut(),
            LibraryLoader::LOAD_LIBRARY_SEARCH_SYSTEM32,
        )
    };
    check_ptr_return(module)
}
/// Looks up the exported symbol `name` in the module `handle` and
/// reinterprets its address as a value of type `T`.
///
/// Returns the last OS error if the symbol can't be found.
///
/// The caller has to pick a `T` that matches the export, because the address is
/// copied bit-for-bit into `T` without any checks.
// It'd be nice to constrain T to std::marker::FnPtr, but that's unstable.
pub unsafe fn get_proc_address<T>(handle: Foundation::HMODULE, name: &CStr) -> apperr::Result<T> {
    let symbol = unsafe { LibraryLoader::GetProcAddress(handle, name.as_ptr() as *const u8) };
    match symbol {
        Some(func) => Ok(unsafe { mem::transmute_copy(&func) }),
        None => Err(get_last_error()),
    }
}
/// Loads `icu.dll` via `load_library` and returns its module handle.
pub unsafe fn load_icu() -> apperr::Result<Foundation::HMODULE> {
unsafe { load_library(w!("icu.dll")) }
}
/// Converts the calling thread's `GetLastError()` value into an `apperr::Error`.
#[cold]
fn get_last_error() -> apperr::Error {
unsafe { gle_to_apperr(Foundation::GetLastError()) }
}
#[inline]
fn gle_to_apperr(gle: u32) -> apperr::Error {
unsafe {
apperr::Error::new(if gle == 0 {
0x8000FFFF
} else {
0x80070000 | gle
})
}
}
/// Converts a `std::io::Error` into an `apperr::Error`.
///
/// Uses the raw OS error code if the error carries one and 0 otherwise,
/// and hands that code to `gle_to_apperr`.
#[inline]
pub fn io_error_to_apperr(err: std::io::Error) -> apperr::Error {
gle_to_apperr(err.raw_os_error().unwrap_or(0) as u32)
}
/// Formats `err` as a human readable string.
///
/// The result always starts with `Error 0x...`. If the system knows a message
/// for the error value, it is appended after `": "` with surrounding whitespace
/// trimmed and line breaks replaced by spaces.
pub fn format_error(err: apperr::Error) -> String {
    unsafe {
        // With FORMAT_MESSAGE_ALLOCATE_BUFFER the system allocates the buffer
        // and stores its address in `ptr`. We have to free it with `LocalFree`.
        let mut ptr: *mut u8 = null_mut();
        let len = Debug::FormatMessageA(
            Debug::FORMAT_MESSAGE_ALLOCATE_BUFFER
                | Debug::FORMAT_MESSAGE_FROM_SYSTEM
                | Debug::FORMAT_MESSAGE_IGNORE_INSERTS,
            null(),
            err.value() as u32,
            0,
            &mut ptr as *mut *mut _ as *mut _,
            0,
            null_mut(),
        );

        let mut result = format!("Error {:#08x}", err.value());

        if len > 0 && !ptr.is_null() {
            // The "A" variant returns text in the process' ANSI code page, which
            // isn't guaranteed to be valid UTF-8. Decode it lossily instead of
            // treating arbitrary bytes as a `str`.
            let bytes = std::slice::from_raw_parts(ptr, len as usize);
            let msg = String::from_utf8_lossy(bytes);
            let msg = msg.trim_ascii().replace(['\r', '\n'], " ");
            result.push_str(": ");
            result.push_str(&msg);
        }

        // Free the buffer whenever one was handed out, independent of `len`.
        if !ptr.is_null() {
            Foundation::LocalFree(ptr as *mut _);
        }

        result
    }
}
/// Turns a Win32 `BOOL` return value into a `Result`.
///
/// Zero means failure, in which case the last OS error is returned.
fn check_bool_return(ret: Foundation::BOOL) -> apperr::Result<()> {
    match ret {
        0 => Err(get_last_error()),
        _ => Ok(()),
    }
}
/// Turns a pointer returned by a Win32 function into a `Result`.
///
/// A null pointer means failure, in which case the last OS error is returned.
/// Any other pointer is passed through unchanged.
fn check_ptr_return<T>(ret: *mut T) -> apperr::Result<*mut T> {
    if !ret.is_null() {
        return Ok(ret);
    }
    Err(get_last_error())
}

7
src/trust_me_bro.rs Normal file
View file

@ -0,0 +1,7 @@
/// Returns the same shared reference with an arbitrary, caller-chosen lifetime.
///
/// Despite the name this bypasses the borrow checker: nothing ensures that
/// the referent actually lives for `'a`. The caller is responsible for that.
pub fn this_lifetime_change_is_totally_safe<'a, T: ?Sized>(x: &T) -> &'a T {
    let ptr: *const T = x;
    // Going through a raw pointer detaches the result from the lifetime of `x`.
    unsafe { &*ptr }
}
/// Returns the same mutable reference with an arbitrary, caller-chosen lifetime.
///
/// Despite the name this bypasses the borrow checker: nothing ensures that
/// the referent lives for `'a` or that the result stays the only reference
/// to it. The caller is responsible for both.
pub fn this_lifetime_change_is_totally_safe_mut<'a, T: ?Sized>(x: &mut T) -> &'a mut T {
    let ptr: *mut T = x;
    // Going through a raw pointer detaches the result from the lifetime of `x`.
    unsafe { &mut *ptr }
}

2958
src/tui.rs Normal file

File diff suppressed because it is too large Load diff

705
src/ucd.rs Normal file
View file

@ -0,0 +1,705 @@
use crate::helpers::{CoordType, Point};
use crate::memchr::{memchr2, memrchr2};
use crate::ucd_gen::*;
use crate::utf8::Utf8Chars;
use std::cmp::Ordering;
/// Read access to the bytes of a document.
///
/// The functions in this module call these methods repeatedly to walk through
/// a document piece by piece, i.e. a returned slice is not assumed to reach
/// all the way to the start/end of the document.
pub trait Document {
/// Returns bytes located before `off`. The returned slice ends at `off`.
fn read_backward(&self, off: usize) -> &[u8];
/// Returns bytes located at and after `off`. The returned slice starts at `off`.
fn read_forward(&self, off: usize) -> &[u8];
}
/// A plain byte slice is a document that consists of a single chunk.
/// Offsets past the end are clamped to the slice length.
impl Document for &[u8] {
    /// Returns everything before `off`.
    fn read_backward(&self, off: usize) -> &[u8] {
        let (before, _) = self.split_at(off.min(self.len()));
        before
    }

    /// Returns everything from `off` on.
    fn read_forward(&self, off: usize) -> &[u8] {
        let (_, after) = self.split_at(off.min(self.len()));
        after
    }
}
/// A position inside a document, expressed in several coordinate systems at once.
///
/// `MeasurementConfig` takes one of these as its starting point and returns
/// a new one from each `goto_*` call.
#[derive(Clone, Copy, Default)]
pub struct UcdCursor {
/// Offset in bytes within the buffer.
pub offset: usize,
/// Position in the buffer in lines (.y) and grapheme clusters (.x).
/// Line wrapping has NO influence on this.
pub logical_pos: Point,
/// Position in the buffer in laid out rows (.y) and columns (.x).
/// Line wrapping has an influence on this.
pub visual_pos: Point,
/// Horizontal position in visual columns.
/// Line wrapping has NO influence on this and if word wrap is disabled,
/// it's identical to `visual_pos.x`. This is useful for calculating tab widths.
pub column: CoordType,
}
/// The state `MeasurementConfig::measure_forward` needs to rewind to the most
/// recent position at which the line may be wrapped.
pub struct WrapOpportunity {
/// Document offset of the wrap position.
absolute_offset: usize,
/// Offset of the `Utf8Chars` iterator at the time the opportunity was recorded.
/// It's relative to the chunk that was being read and is used to seek the iterator back.
offset_next_cluster: usize,
/// Lookup result for the first character of the cluster that follows the wrap position.
props_next_cluster: usize,
/// Logical column (in grapheme clusters) at the wrap position.
logical_pos_x: CoordType,
}
/// Measures text in a `Document`: Translates between byte offsets, logical positions
/// and visual positions, taking tab width and optional word wrap into account.
pub struct MeasurementConfig<'doc> {
/// The document to measure.
buffer: &'doc dyn Document,
/// Tab stop distance in columns.
tab_size: CoordType,
/// Column at which lines are wrapped. `CoordType::MAX` disables word wrap.
word_wrap_column: CoordType,
/// The position from which the next `goto_*` call starts measuring.
cursor: UcdCursor,
}
impl<'doc> MeasurementConfig<'doc> {
/// Creates a config for `buffer` with a tab size of 8, word wrap disabled
/// and the cursor at the very start of the document.
pub fn new(buffer: &'doc dyn Document) -> Self {
Self {
buffer,
tab_size: 8,
word_wrap_column: CoordType::MAX,
cursor: UcdCursor::default(),
}
}
/// Sets the tab stop distance in columns.
pub fn with_tab_size(mut self, tab_size: CoordType) -> Self {
self.tab_size = tab_size;
self
}
/// Sets the column at which lines wrap. `CoordType::MAX` disables word wrap.
pub fn with_word_wrap_column(mut self, word_wrap_column: CoordType) -> Self {
self.word_wrap_column = word_wrap_column;
self
}
/// Sets the position from which the next `goto_*` call starts measuring.
pub fn with_cursor(mut self, cursor: UcdCursor) -> Self {
self.cursor = cursor;
self
}
/// Measures forward from the current cursor with the byte offset `offset`
/// as the only target. Stores and returns the resulting cursor.
pub fn goto_offset(&mut self, offset: usize) -> UcdCursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
offset,
Point::MAX,
Point::MAX,
self.cursor,
self.buffer,
);
self.cursor
}
/// Measures forward from the current cursor with the logical position
/// `logical_target` as the only target. Stores and returns the resulting cursor.
pub fn goto_logical(&mut self, logical_target: Point) -> UcdCursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
usize::MAX,
logical_target,
Point::MAX,
self.cursor,
self.buffer,
);
self.cursor
}
/// Measures forward from the current cursor with the visual position
/// `visual_target` as the only target. Stores and returns the resulting cursor.
pub fn goto_visual(&mut self, visual_target: Point) -> UcdCursor {
self.cursor = Self::measure_forward(
self.tab_size,
self.word_wrap_column,
usize::MAX,
Point::MAX,
visual_target,
self.cursor,
self.buffer,
);
self.cursor
}
/// Returns the cursor produced by the most recent `goto_*` call
/// (or the initial cursor if there was none).
pub fn cursor(&self) -> UcdCursor {
self.cursor
}
/// Walks forward through `buffer`, one grapheme cluster at a time, starting at `cursor`,
/// until one of the three targets is reached or the document ends.
///
/// Targets that should be ignored are passed as `usize::MAX` / `Point::MAX`.
/// If `cursor` is already at or past the logical or visual target it's returned as is,
/// i.e. this function never moves backward.
fn measure_forward(
tab_size: CoordType,
word_wrap_column: CoordType,
offset_target: usize,
logical_target: Point,
visual_target: Point,
cursor: UcdCursor,
buffer: &dyn Document,
) -> UcdCursor {
if cursor.logical_pos >= logical_target || cursor.visual_pos >= visual_target {
return cursor;
}
// `wrap` is the last position at which the current line may be broken.
// `hit` is a target hit that is still tentative, because a later word wrap
// may move the hit position onto the next visual row.
let mut wrap: Option<WrapOpportunity> = None;
let mut hit: Option<UcdCursor> = None;
let mut absolute_offset = cursor.offset;
let mut logical_pos_x = cursor.logical_pos.x;
let mut logical_pos_y = cursor.logical_pos.y;
let mut visual_pos_x = cursor.visual_pos.x;
let mut visual_pos_y = cursor.visual_pos.y;
let mut column = cursor.column;
// The `*_target_x` values are the targets that apply to the current line/row.
let (mut offset_target_x, mut logical_target_x, mut visual_target_x) = Self::recalc_target(
offset_target,
logical_target,
visual_target,
logical_pos_y,
visual_pos_y,
);
// The outer loop fetches chunks from the document,
// the inner loop processes the grapheme clusters of one chunk.
'outer: loop {
let chunk = buffer.read_forward(absolute_offset);
let chunk_beg = absolute_offset;
let chunk_end = absolute_offset + chunk.len();
let mut it = Utf8Chars::new(chunk, 0);
// An empty chunk means that we've reached the end of the document.
let Some(mut ch) = it.next() else {
break;
};
let mut props_next_cluster = ucd_grapheme_cluster_lookup(ch);
loop {
if absolute_offset >= chunk_end {
break;
}
if absolute_offset >= offset_target_x
|| logical_pos_x >= logical_target_x
|| visual_pos_x >= visual_target_x
{
// Without a pending wrap opportunity the hit is final.
if wrap.is_none() {
break 'outer;
}
hit = Some(UcdCursor {
offset: absolute_offset,
logical_pos: Point {
x: logical_pos_x,
y: logical_pos_y,
},
visual_pos: Point {
x: visual_pos_x,
y: visual_pos_y,
},
column,
});
// Prevent hits on the same line until we encounter a line wrap or explicit newline.
offset_target_x = usize::MAX;
logical_target_x = CoordType::MAX;
visual_target_x = CoordType::MAX;
}
let props_current_cluster = props_next_cluster;
let is_tab = ch == '\t';
let mut offset_next_cluster;
let mut width = 0;
let mut state = 0;
// Figure out the length and width of the rest of the grapheme cluster.
loop {
offset_next_cluster = it.offset();
width += ucd_grapheme_cluster_character_width(props_next_cluster) as CoordType;
let Some(ch_next) = it.next() else {
break;
};
ch = ch_next;
let props_trail = ucd_grapheme_cluster_lookup(ch);
state = ucd_grapheme_cluster_joins(state, props_next_cluster, props_trail);
props_next_cluster = props_trail;
if ucd_grapheme_cluster_joins_done(state) {
break;
}
}
// Translate the chunk-relative offset into a document offset.
let offset_next_cluster = chunk_beg + offset_next_cluster;
if is_tab {
// Tabs require special handling because they can have a variable width.
width = tab_size - (column % tab_size);
} else {
width = width.min(2);
}
// Hard wrap: Both the logical and visual position advance by one line.
if ucd_grapheme_cluster_is_newline(props_current_cluster) {
// Don't cross the newline if the target is on this line.
// E.g. if the callers asks for column 100 on a 10 column line,
// we'll return with the cursor set to column 10.
if logical_pos_y >= logical_target.y || visual_pos_y >= visual_target.y {
break 'outer;
}
logical_pos_x = 0;
logical_pos_y += 1;
visual_pos_x = 0;
visual_pos_y += 1;
column = 0;
// We moved the logical/visual pos past the newline,
// so we also need to move the offset past it.
absolute_offset = offset_next_cluster;
(offset_target_x, logical_target_x, visual_target_x) = Self::recalc_target(
offset_target,
logical_target,
visual_target,
logical_pos_y,
visual_pos_y,
);
continue;
}
// Line/word-wrap handling.
if word_wrap_column != CoordType::MAX && visual_pos_x + width > word_wrap_column {
// Reset to the last break opportunity, if there was any.
if let Some(ref w) = wrap {
absolute_offset = w.absolute_offset;
it.seek(w.offset_next_cluster);
props_next_cluster = w.props_next_cluster;
logical_pos_x = w.logical_pos_x;
}
// Wrap!
visual_pos_x = 0;
visual_pos_y += 1;
(offset_target_x, logical_target_x, visual_target_x) = Self::recalc_target(
offset_target,
logical_target,
visual_target,
logical_pos_y,
visual_pos_y,
);
// The wrap discards whatever was recorded for the previous visual row.
wrap = None;
hit = None;
if absolute_offset < chunk_beg {
// We've had to reset to a point before this chunk,
// so we have to re-read the previous contents.
break;
}
continue;
}
// Avoid advancing past the visual target, because `width` can be greater than 1.
if visual_pos_x + width > visual_target_x {
if word_wrap_column == CoordType::MAX || wrap.is_none() {
break 'outer;
}
hit = Some(UcdCursor {
offset: absolute_offset,
logical_pos: Point {
x: logical_pos_x,
y: logical_pos_y,
},
visual_pos: Point {
x: visual_pos_x,
y: visual_pos_y,
},
column,
});
// Prevent hits on the same line until we encounter a line wrap or explicit newline.
offset_target_x = usize::MAX;
logical_target_x = CoordType::MAX;
visual_target_x = CoordType::MAX;
}
// Advance past the current cluster.
absolute_offset = offset_next_cluster;
logical_pos_x += 1;
visual_pos_x += width;
column += width;
if word_wrap_column != CoordType::MAX
&& !ucd_line_break_joins(props_current_cluster, props_next_cluster)
{
// We're at a position where the line may be broken. A tentative hit
// can't be affected by a word wrap anymore at this point, so it's final.
if hit.is_some() {
break 'outer;
}
wrap = Some(WrapOpportunity {
absolute_offset,
offset_next_cluster: it.offset(),
props_next_cluster,
logical_pos_x,
});
}
}
}
// A cursor that sits exactly at the wrap column moves to the start of the next row.
if visual_pos_x >= word_wrap_column {
visual_pos_x = 0;
visual_pos_y += 1;
}
if let Some(c) = hit {
return c;
}
UcdCursor {
offset: absolute_offset,
logical_pos: Point {
x: logical_pos_x,
y: logical_pos_y,
},
visual_pos: Point {
x: visual_pos_x,
y: visual_pos_y,
},
column,
}
}
/// Computes the targets that apply to the current logical line / visual row:
/// the offset target (passed through unchanged) and the logical and visual
/// target columns as determined by `target_column`.
#[inline]
fn recalc_target(
offset_target: usize,
logical_target: Point,
visual_target: Point,
logical_pos_y: CoordType,
visual_pos_y: CoordType,
) -> (usize, CoordType, CoordType) {
(
offset_target,
Self::target_column(logical_target, logical_pos_y),
Self::target_column(visual_target, visual_pos_y),
)
}
/// Returns the column to stop at on line `y` when heading for `target`:
/// `CoordType::MAX` if the target line is still ahead, `target.x` if we're
/// on the target line and 0 if we're already past it.
#[inline]
fn target_column(target: Point, y: CoordType) -> CoordType {
match y.cmp(&target.y) {
Ordering::Less => CoordType::MAX,
Ordering::Equal => target.x,
Ordering::Greater => 0,
}
}
}
/// The character classes that word navigation distinguishes between.
#[derive(Clone, Copy, PartialEq, Eq)]
enum CharClass {
/// Space and tab.
Whitespace,
/// Carriage return and line feed.
Newline,
/// Any of the separator characters handed to `construct_classifier`.
Separator,
/// Everything else.
Word,
}
/// Builds a lookup table that maps every possible byte value to its `CharClass`.
///
/// All bytes start out as `Word`. Space and tab become `Whitespace`, CR and LF
/// become `Newline` and finally every byte in `separators` becomes `Separator`.
///
/// Panics (at compile time when used in a const context) if `separators`
/// contains a non-ASCII byte.
const fn construct_classifier(separators: &[u8]) -> [CharClass; 256] {
    let mut table = [CharClass::Word; 256];

    table[b'\r' as usize] = CharClass::Newline;
    table[b'\n' as usize] = CharClass::Newline;
    table[b'\t' as usize] = CharClass::Whitespace;
    table[b' ' as usize] = CharClass::Whitespace;

    // Iterators aren't available in const fn, hence the manual index loop.
    let mut idx = 0;
    while idx < separators.len() {
        let byte = separators[idx];
        assert!(byte < 128, "Only ASCII separators are supported.");
        table[byte as usize] = CharClass::Separator;
        idx += 1;
    }

    table
}
/// Byte-indexed character class table used by word navigation.
/// The raw string lists the ASCII punctuation that is treated as separators.
const WORD_CLASSIFIER: [CharClass; 256] =
construct_classifier(br#"`~!@#$%^&*()-=+[{]}\|;:'",.<>/?"#);
/// Finds the next word boundary given a document cursor offset.
/// Returns the offset of the next word boundary.
///
/// The navigator starts out with an empty chunk; document contents
/// are read by the `WordForward` implementation.
pub fn word_forward(doc: &dyn Document, offset: usize) -> usize {
word_navigation(WordForward {
doc,
offset,
chunk: &[],
chunk_off: 0,
})
}
/// The backward version of `word_forward`.
/// Returns the offset of the previous word boundary.
///
/// The navigator starts out with an empty chunk; document contents
/// are read by the `WordBackward` implementation.
pub fn word_backward(doc: &dyn Document, offset: usize) -> usize {
word_navigation(WordBackward {
doc,
offset,
chunk: &[],
chunk_off: 0,
})
}
/// Word navigation implementation. Matches the behavior of VS Code.
///
/// The direction is determined by the `WordNavigation` implementation passed in.
/// Returns the document offset at which the navigation ended up.
fn word_navigation<T: WordNavigation>(mut nav: T) -> usize {
// First skip one newline, if any.
nav.skip_newline();
// Skip any whitespace.
nav.skip_class(CharClass::Whitespace);
// Skip one word or separator and take note of the class.
// If there's nothing left to look at, `peek` yields the whitespace
// default, which doesn't match below and thus ends the navigation.
let class = nav.peek(CharClass::Whitespace);
if matches!(class, CharClass::Separator | CharClass::Word) {
nav.next();
let off = nav.offset();
// Continue skipping the same class.
nav.skip_class(class);
// If the class was a separator and we only moved one character,
// continue skipping characters of the word class.
if off == nav.offset() && class == CharClass::Separator {
nav.skip_class(CharClass::Word);
}
}
nav.offset()
}
/// The direction-specific primitives that `word_navigation` is built from.
trait WordNavigation {
/// Skips a single newline (CR, LF or CRLF) at the current position, if there is one.
fn skip_newline(&mut self);
/// Skips all consecutive characters of the given class.
fn skip_class(&mut self, class: CharClass);
/// Returns the class of the character at the current position
/// or `default` if no character is available.
fn peek(&self, default: CharClass) -> CharClass;
/// Moves one character in the direction of travel.
fn next(&mut self);
/// Returns the current position as a document offset.
fn offset(&self) -> usize;
}
/// Forward-moving `WordNavigation` over a `Document`, used by `word_forward`.
struct WordForward<'a> {
    /// The document to read chunks from.
    doc: &'a dyn Document,
    /// Document offset of the start of `chunk`.
    offset: usize,
    /// The chunk that is currently being scanned. May start out empty.
    chunk: &'a [u8],
    /// Read position relative to the start of `chunk`.
    chunk_off: usize,
}

impl WordNavigation for WordForward<'_> {
    fn skip_newline(&mut self) {
        // The navigator may be constructed with an empty chunk (or sit at the end of
        // one), so fetch data first. Without this the checks below could never match.
        if self.chunk_off >= self.chunk.len() {
            self.offset += self.chunk.len();
            self.chunk = self.doc.read_forward(self.offset);
            self.chunk_off = 0;
        }

        // We can rely on the fact that the document does not split graphemes across chunks.
        // = If there's a newline it's wholly contained in this chunk.
        if self.chunk_off < self.chunk.len() && self.chunk[self.chunk_off] == b'\r' {
            self.chunk_off += 1;
        }
        if self.chunk_off < self.chunk.len() && self.chunk[self.chunk_off] == b'\n' {
            self.chunk_off += 1;
        }
    }

    fn skip_class(&mut self, class: CharClass) {
        loop {
            while self.chunk_off < self.chunk.len() {
                if WORD_CLASSIFIER[self.chunk[self.chunk_off] as usize] != class {
                    return;
                }
                self.chunk_off += 1;
            }

            // The chunk is exhausted: Continue with the next one.
            self.offset += self.chunk.len();
            self.chunk = self.doc.read_forward(self.offset);
            self.chunk_off = 0;

            // An empty chunk means we've reached the end of the document.
            // We must stop here, because otherwise we'd loop forever.
            if self.chunk.is_empty() {
                return;
            }
        }
    }

    fn peek(&self, default: CharClass) -> CharClass {
        if self.chunk_off < self.chunk.len() {
            WORD_CLASSIFIER[self.chunk[self.chunk_off] as usize]
        } else {
            default
        }
    }

    fn next(&mut self) {
        self.chunk_off += 1;
    }

    fn offset(&self) -> usize {
        self.offset + self.chunk_off
    }
}
/// Backward-moving `WordNavigation` over a `Document`, used by `word_backward`.
struct WordBackward<'a> {
    /// The document to read chunks from.
    doc: &'a dyn Document,
    /// Document offset of the END of `chunk`.
    offset: usize,
    /// The chunk that is currently being scanned. May start out empty.
    chunk: &'a [u8],
    /// Read position relative to the start of `chunk`.
    /// The character under the cursor is the one at `chunk_off - 1`.
    chunk_off: usize,
}

impl WordNavigation for WordBackward<'_> {
    fn skip_newline(&mut self) {
        // The navigator may be constructed with an empty chunk (or sit at the start of
        // one), so fetch data first. Without this the checks below could never match.
        if self.chunk_off == 0 {
            self.offset -= self.chunk.len();
            self.chunk = self.doc.read_backward(self.offset);
            self.chunk_off = self.chunk.len();
        }

        // We can rely on the fact that the document does not split graphemes across chunks.
        // = If there's a newline it's wholly contained in this chunk.
        // Since we're moving backward through a CRLF, the '\n' is encountered
        // first and the '\r' second, so they are checked in that order.
        if self.chunk_off > 0 && self.chunk[self.chunk_off - 1] == b'\n' {
            self.chunk_off -= 1;
        }
        if self.chunk_off > 0 && self.chunk[self.chunk_off - 1] == b'\r' {
            self.chunk_off -= 1;
        }
    }

    fn skip_class(&mut self, class: CharClass) {
        loop {
            while self.chunk_off > 0 {
                if WORD_CLASSIFIER[self.chunk[self.chunk_off - 1] as usize] != class {
                    return;
                }
                self.chunk_off -= 1;
            }

            // The chunk is exhausted: Continue with the one before it.
            self.offset -= self.chunk.len();
            self.chunk = self.doc.read_backward(self.offset);
            self.chunk_off = self.chunk.len();

            // An empty chunk means we've reached the start of the document.
            // We must stop here, because otherwise we'd loop forever.
            if self.chunk.is_empty() {
                return;
            }
        }
    }

    fn peek(&self, default: CharClass) -> CharClass {
        if self.chunk_off > 0 {
            WORD_CLASSIFIER[self.chunk[self.chunk_off - 1] as usize]
        } else {
            default
        }
    }

    fn next(&mut self) {
        self.chunk_off -= 1;
    }

    fn offset(&self) -> usize {
        self.offset - self.chunk.len() + self.chunk_off
    }
}
/// Seeks forward from `offset` (which is on line `line`) to the start of line `line_stop`.
///
/// CR, LF and CRLF each count as one newline. Returns the resulting offset and
/// line number. If the text ends before `line_stop` is reached, the offset is
/// the end of the text and the line number is that of the last line.
///
/// If `line` is already at or past `line_stop`, this defers to `newlines_backward`,
/// which leaves the cursor at the beginning of a line.
pub fn newlines_forward(
    text: &[u8],
    mut offset: usize,
    mut line: CoordType,
    line_stop: CoordType,
) -> (usize, CoordType) {
    if line >= line_stop {
        return newlines_backward(text, offset, line, line_stop);
    }

    let len = text.len();
    offset = offset.min(len);

    while line < line_stop {
        offset = memchr2(b'\r', b'\n', text, offset);
        if offset >= len {
            break;
        }

        let is_cr = text[offset] == b'\r';
        offset += 1;
        // Treat CRLF as a single newline.
        if is_cr && text.get(offset) == Some(&b'\n') {
            offset += 1;
        }
        line += 1;
    }

    (offset, line)
}
// Seeks backward from `offset` (which is on line `line`) to the start of line `line_stop`.
// Whatever the parameters are, the returned offset is always at the start of a line:
// Even if `line == line_stop` already, it'll still seek backward to the line start.
// If the start of the text is reached first, offset 0 is returned.
pub fn newlines_backward(
    text: &[u8],
    mut offset: usize,
    mut line: CoordType,
    line_stop: CoordType,
) -> (usize, CoordType) {
    offset = offset.min(text.len());

    while let Some(idx) = memrchr2(b'\r', b'\n', text, offset) {
        if line <= line_stop {
            // +1: Past the newline, at the start of the current line.
            return (idx + 1, line);
        }

        // Treat CRLF as a single newline by continuing in front of the CR.
        let is_crlf = text[idx] == b'\n' && idx != 0 && text[idx - 1] == b'\r';
        offset = if is_crlf { idx - 1 } else { idx };
        line -= 1;
    }

    (0, line)
}
/// Removes one trailing newline (LF, CR or CRLF) from `text`, if there is one.
///
/// A trailing LF is removed first and then a trailing CR,
/// so that CRLF is removed as a unit.
pub fn strip_newline(text: &[u8]) -> &[u8] {
    let text = match text {
        [head @ .., b'\n'] => head,
        _ => text,
    };
    match text {
        [head @ .., b'\r'] => head,
        _ => text,
    }
}
#[cfg(test)]
mod test {
use super::*;
// Seeking to the start of the second row must move the cursor past the newline.
#[test]
fn test_measure_forward_newline_start() {
let cursor =
MeasurementConfig::new(&"foo\nbar".as_bytes()).goto_visual(Point { x: 0, y: 1 });
assert_eq!(cursor.offset, 4);
assert_eq!(cursor.logical_pos, Point { x: 0, y: 1 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 1 });
}
// A visual target that falls into the middle of a wide character
// must leave the cursor in front of that character.
#[test]
fn test_measure_forward_clipped_wide_char() {
let cursor = MeasurementConfig::new(&"a😶🌫b".as_bytes()).goto_visual(Point { x: 2, y: 0 });
assert_eq!(cursor.offset, 1);
assert_eq!(cursor.logical_pos, Point { x: 1, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 1, y: 0 });
}
// Exercises word wrap at column 6 with logical and visual targets. The second half
// reuses one config, so each `goto_visual` continues from the previous cursor.
#[test]
fn test_measure_forward_word_wrap() {
// |foo␣ |
// |bar␣ |
// |baz |
let text = "foo bar \nbaz".as_bytes();
let cursor = MeasurementConfig::new(&text)
.with_word_wrap_column(6)
.goto_logical(Point { x: 5, y: 0 });
assert_eq!(cursor.offset, 5);
assert_eq!(cursor.logical_pos, Point { x: 5, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 1, y: 1 });
let mut cfg = MeasurementConfig::new(&text).with_word_wrap_column(6);
let cursor = cfg.goto_visual(Point { x: 5, y: 0 });
assert_eq!(cursor.offset, 4);
assert_eq!(cursor.logical_pos, Point { x: 4, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 1 });
let cursor = cfg.goto_visual(Point { x: 0, y: 1 });
assert_eq!(cursor.offset, 4);
assert_eq!(cursor.logical_pos, Point { x: 4, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 1 });
let cursor = cfg.goto_visual(Point { x: 100, y: 1 });
assert_eq!(cursor.offset, 8);
assert_eq!(cursor.logical_pos, Point { x: 8, y: 0 });
assert_eq!(cursor.visual_pos, Point { x: 4, y: 1 });
let cursor = cfg.goto_visual(Point { x: 0, y: 2 });
assert_eq!(cursor.offset, 9);
assert_eq!(cursor.logical_pos, Point { x: 0, y: 1 });
assert_eq!(cursor.visual_pos, Point { x: 0, y: 2 });
let cursor = cfg.goto_visual(Point { x: 100, y: 2 });
assert_eq!(cursor.offset, 12);
assert_eq!(cursor.logical_pos, Point { x: 3, y: 1 });
assert_eq!(cursor.visual_pos, Point { x: 3, y: 2 });
}
}

1066
src/ucd_gen.rs Normal file

File diff suppressed because it is too large Load diff

217
src/utf8.rs Normal file
View file

@ -0,0 +1,217 @@
use crate::helpers;
use std::{hint, iter, mem};
/// An iterator over the characters of a byte slice that may contain invalid UTF-8.
/// Invalid sequences are yielded as U+FFFD instead of being an error.
#[derive(Clone, Copy)]
pub struct Utf8Chars<'a> {
/// The bytes to decode.
source: &'a [u8],
/// Byte offset into `source` of the next character to decode.
offset: usize,
}
impl<'a> Utf8Chars<'a> {
/// Creates an iterator over `source` that starts decoding at byte `offset`.
pub fn new(source: &'a [u8], offset: usize) -> Self {
Self { source, offset }
}
/// Returns the byte offset of the next character to decode.
pub fn offset(&self) -> usize {
self.offset
}
/// Continues decoding at byte `offset`. The offset is not validated.
pub fn seek(&mut self, offset: usize) {
self.offset = offset;
}
/// Returns the replacement character U+FFFD. Called for invalid input.
#[inline(always)]
fn fffd() -> Option<char> {
// Improves performance by ~5% and reduces code size.
helpers::cold_path();
Some('\u{FFFD}')
}
}
impl Iterator for Utf8Chars<'_> {
    type Item = char;

    /// Decodes the next character.
    ///
    /// Returns `None` at the end of the input. Invalid or truncated sequences
    /// yield U+FFFD. In that case only the bytes that were successfully
    /// validated are consumed, so decoding resumes at the offending byte.
    fn next(&mut self) -> Option<Self::Item> {
        if self.offset >= self.source.len() {
            return None;
        }

        let c = self.source[self.offset];
        self.offset += 1;

        // See: https://datatracker.ietf.org/doc/html/rfc3629
        // as well as ICU's `utf8.h` for the bitmask approach.

        // UTF8-1 = %x00-7F
        if (c & 0x80) == 0 {
            return Some(c as char);
        }

        // Everything else needs at least one trail byte.
        if self.offset >= self.source.len() {
            return Self::fffd();
        }

        let mut cp = c as u32;

        if cp < 0xE0 {
            // UTF8-2 = %xC2-DF UTF8-tail

            // This also rejects stray trail bytes (0x80-BF) and the overlong leads 0xC0/0xC1.
            if cp < 0xC2 {
                return Self::fffd();
            }

            // The lead byte is 110xxxxx
            // -> Strip off the 110 prefix
            cp &= !0xE0;
        } else if cp < 0xF0 {
            // UTF8-3 =
            //   %xE0    %xA0-BF   UTF8-tail
            //   %xE1-EC UTF8-tail UTF8-tail
            //   %xED    %x80-9F   UTF8-tail
            //   %xEE-EF UTF8-tail UTF8-tail

            // This is a pretty neat approach seen in ICU4C, because it's a 1:1 translation of the RFC.
            // I don't understand why others don't do the same thing. It's rather performant.
            const BITS_80_9F: u8 = 1 << 0b100; // 0x80-9F, aka 0b100xxxxx
            const BITS_A0_BF: u8 = 1 << 0b101; // 0xA0-BF, aka 0b101xxxxx
            const BITS_BOTH: u8 = BITS_80_9F | BITS_A0_BF;
            const LEAD_TRAIL1_BITS: [u8; 16] = [
                //             v-- lead byte
                BITS_A0_BF, // 0xE0
                BITS_BOTH,  // 0xE1
                BITS_BOTH,  // 0xE2
                BITS_BOTH,  // 0xE3
                BITS_BOTH,  // 0xE4
                BITS_BOTH,  // 0xE5
                BITS_BOTH,  // 0xE6
                BITS_BOTH,  // 0xE7
                BITS_BOTH,  // 0xE8
                BITS_BOTH,  // 0xE9
                BITS_BOTH,  // 0xEA
                BITS_BOTH,  // 0xEB
                BITS_BOTH,  // 0xEC
                BITS_80_9F, // 0xED
                BITS_BOTH,  // 0xEE
                BITS_BOTH,  // 0xEF
            ];

            // The lead byte is 1110xxxx
            // -> Strip off the 1110 prefix
            cp &= !0xF0;

            let t = self.source[self.offset];
            if LEAD_TRAIL1_BITS[cp as usize] & (1 << (t >> 5)) == 0 {
                return Self::fffd();
            }
            cp = (cp << 6) | (t as u32 & 0x3F);

            self.offset += 1;
            if self.offset >= self.source.len() {
                return Self::fffd();
            }
        } else {
            // UTF8-4 =
            //   %xF0    %x90-BF   UTF8-tail UTF8-tail
            //   %xF1-F3 UTF8-tail UTF8-tail UTF8-tail
            //   %xF4    %x80-8F   UTF8-tail UTF8-tail

            // This is similar to the above, but with the indices flipped:
            // The trail byte is the index and the lead byte mask is the value.
            // This is because the split at 0x90 requires more bits than fit into an u8.
            const TRAIL1_LEAD_BITS: [u8; 16] = [
                //    +------ 0xF4 lead
                //    |+----- 0xF3 lead
                //    ||+---- 0xF2 lead
                //    |||+--- 0xF1 lead
                //    ||||+-- 0xF0 lead
                //    vvvvv
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, // trail bytes:
                0b_00000, //
                0b_11110, // 0x80-8F -> 0x80-8F can be preceded by 0xF1-F4
                0b_01111, // 0x90-9F -v
                0b_01111, // 0xA0-AF -> 0x90-BF can be preceded by 0xF0-F3
                0b_01111, // 0xB0-BF -^
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
            ];

            // The lead byte *may* be 11110xxx, but could also be e.g. 11111xxx.
            // -> Only strip off the 1111 prefix
            cp &= !0xF0;

            // Now we can verify if it's actually <= 0xF4.
            if cp > 4 {
                return Self::fffd();
            }

            let t = self.source[self.offset];
            if TRAIL1_LEAD_BITS[(t >> 4) as usize] & (1 << cp) == 0 {
                return Self::fffd();
            }
            cp = (cp << 6) | (t as u32 & 0x3F);

            self.offset += 1;
            if self.offset >= self.source.len() {
                return Self::fffd();
            }

            // UTF8-tail = %x80-BF
            // `wrapping_sub`, because bytes below 0x80 are expected here for invalid input.
            // They must wrap around to a value > 0x3F instead of panicking in debug builds.
            let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
            if t > 0x3F {
                return Self::fffd();
            }
            cp = (cp << 6) | t;

            self.offset += 1;
            if self.offset >= self.source.len() {
                return Self::fffd();
            }
        }

        // Every path above has already checked that the final trail byte exists.
        unsafe { hint::assert_unchecked(self.offset < self.source.len()) };

        // UTF8-tail = %x80-BF
        // `wrapping_sub` for the same reason as above.
        let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
        if t > 0x3F {
            return Self::fffd();
        }
        cp = (cp << 6) | t;

        self.offset += 1;

        // The range checks above rule out surrogates and values above U+10FFFF.
        Some(unsafe { mem::transmute(cp) })
    }
}

// Once the end of the input is reached, `next()` keeps returning `None`.
impl iter::FusedIterator for Utf8Chars<'_> {}
#[cfg(test)]
mod tests {
use super::*;
// Checks `Utf8Chars` against the standard library's `utf8_chunks()`: Valid characters
// must be yielded as is, each invalid chunk must yield exactly one U+FFFD, and the
// iterator offset must advance by the same number of bytes in both cases.
#[test]
fn test_broken_utf8() {
// 0xED 0xA0 0x80 would decode to a surrogate, which is invalid in UTF-8.
let source = [b'a', 0xED, 0xA0, 0x80, b'b'];
let mut chars = Utf8Chars::new(&source, 0);
let mut offset = 0;
for chunk in source.utf8_chunks() {
for ch in chunk.valid().chars() {
offset += ch.len_utf8();
assert_eq!(chars.next(), Some(ch));
assert_eq!(chars.offset(), offset);
}
if !chunk.invalid().is_empty() {
offset += chunk.invalid().len();
assert_eq!(chars.next(), Some('\u{FFFD}'));
assert_eq!(chars.offset(), offset);
}
}
}
}

319
src/vt.rs Normal file
View file

@ -0,0 +1,319 @@
use core::time;
use crate::memchr::memchr2;
/// A single item produced by the VT parser.
pub enum Token<'parser, 'input> {
/// A run of bytes that contains neither control characters (< 0x20) nor DEL (0x7f).
Text(&'input str),
/// A single control character (< 0x20, other than ESC) or DEL (0x7f).
Ctrl(char),
/// ESC followed by the given character. The character is `'\0'`
/// if the escape was not followed by anything (see `Parser::read_timeout`).
Esc(char),
/// ESC O followed by the given character.
SS3(char),
/// A control sequence that was introduced with ESC [.
Csi(&'parser Csi),
/// Data of a sequence introduced with ESC ].
/// NOTE(review): `partial` presumably means that the terminator hasn't been seen yet - confirm.
Osc { data: &'input str, partial: bool },
/// Data of a sequence introduced with ESC P.
/// NOTE(review): `partial` presumably means that the terminator hasn't been seen yet - confirm.
Dcs { data: &'input str, partial: bool },
}
/// The states of the VT parser state machine.
#[derive(Clone, Copy)]
pub enum State {
/// Not inside of any sequence.
Ground,
/// An ESC was seen.
Esc,
/// ESC O was seen.
Ss3,
/// ESC [ was seen.
Csi,
/// ESC ] was seen.
Osc,
/// ESC P was seen.
Dcs,
/// NOTE(review): presumably an ESC seen while in `Osc` - confirm.
OscEsc,
/// NOTE(review): presumably an ESC seen while in `Dcs` - confirm.
DcsEsc,
}
/// The contents of a parsed CSI sequence.
pub struct Csi {
/// The numeric parameters. Unused slots are 0.
pub params: [i32; 32],
/// The number of entries of `params` that are in use.
pub param_count: usize,
/// NOTE(review): presumably the private marker byte (e.g. `?`), `'\0'` if absent - confirm.
pub private_byte: char,
/// The byte in the range 0x40..=0x7e that terminated the sequence.
pub final_byte: char,
}
/// A VT sequence parser. It keeps its state between calls to `parse()`,
/// so a sequence may be split across multiple inputs.
pub struct Parser {
/// The current state of the state machine.
state: State,
// Csi is not part of State, because it allows us
// to more quickly erase and reuse the struct.
csi: Csi,
}
impl Parser {
    /// Creates a parser in the ground state with an empty CSI scratch struct.
    pub fn new() -> Self {
        let csi = Csi {
            params: [0; 32],
            param_count: 0,
            private_byte: '\0',
            final_byte: '\0',
        };
        Self {
            state: State::Ground,
            csi,
        }
    }

    /// Suggests a timeout for the next call to `read()`.
    ///
    /// We need this because of the ambiguity of whether a trailing
    /// escape character in an input is starting another escape sequence or
    /// is just the result of the user literally pressing the Escape key.
    pub fn read_timeout(&mut self) -> Option<std::time::Duration> {
        // Only a pending escape character is ambiguous.
        if !matches!(self.state, State::Esc) {
            return None;
        }

        // 100ms is an upper ceiling for a responsive feel. This uses half that,
        // under the assumption that a really slow terminal needs equal amounts
        // of time for I and O. Realistically though, this could be much lower.
        Some(time::Duration::from_millis(50))
    }

    /// Parses the given input into VT sequences.
    ///
    /// You should call this function even if your `read()`
    /// had a timeout (pass an empty string in that case).
    ///
    /// The returned `Stream` borrows the parser and starts at the beginning of `input`.
    pub fn parse<'parser, 'input>(
        &'parser mut self,
        input: &'input str,
    ) -> Stream<'parser, 'input> {
        Stream {
            off: 0,
            input,
            parser: self,
        }
    }
}
/// Walks over one input string and yields the tokens it contains.
/// Created by `Parser::parse()`.
pub struct Stream<'parser, 'input> {
/// The parser whose state is advanced while reading `input`.
parser: &'parser mut Parser,
/// The input handed to `Parser::parse()`.
input: &'input str,
/// Byte offset into `input` of the next unread byte.
off: usize,
}
impl Stream<'_, '_> {
    /// Reads and consumes raw bytes from the input.
    ///
    /// Copies as many of the not yet consumed input bytes into `dst` as fit,
    /// advances the stream past them and returns the number of bytes copied.
    pub fn read(&mut self, dst: &mut [u8]) -> usize {
        let bytes = self.input.as_bytes();
        let off = self.off.min(bytes.len());
        let len = dst.len().min(bytes.len() - off);
        dst[..len].copy_from_slice(&bytes[off..off + len]);
        self.off += len;
        len
    }

    /// Parses the next VT sequence from the previously given input.
    ///
    /// Can't implement Iterator, because this is a "lending iterator".
    ///
    /// Returns `None` once the input is exhausted. A sequence that is cut off
    /// by the end of the input is remembered in the parser state and will be
    /// continued by the next `Stream` created from the same `Parser`.
    pub fn next(&mut self) -> Option<Token> {
        let parser = &mut *self.parser;
        let input = self.input;
        let bytes = input.as_bytes();

        // If the previous input ended with an escape character, `read_timeout()`
        // returned `Some(..)` timeout, and if the caller did everything correctly
        // and there was indeed a timeout, we should be called with an empty
        // input. In that case we'll return the escape as its own token.
        if input.is_empty() && matches!(parser.state, State::Esc) {
            parser.state = State::Ground;
            return Some(Token::Esc('\0'));
        }

        while self.off < bytes.len() {
            match parser.state {
                State::Ground => match bytes[self.off] {
                    0x1b => {
                        parser.state = State::Esc;
                        self.off += 1;
                    }
                    c @ (0x00..0x20 | 0x7f) => {
                        self.off += 1;
                        return Some(Token::Ctrl(c as char));
                    }
                    _ => {
                        // Plain text runs until the next control character.
                        // Control characters are ASCII, so both ends of the
                        // slice are guaranteed to be UTF-8 boundaries.
                        let beg = self.off;
                        while {
                            self.off += 1;
                            self.off < bytes.len()
                                && bytes[self.off] >= 0x20
                                && bytes[self.off] != 0x7f
                        } {}
                        return Some(Token::Text(&input[beg..self.off]));
                    }
                },
                State::Esc => {
                    let c = bytes[self.off];
                    self.off += 1;
                    match c {
                        b'[' => {
                            parser.state = State::Csi;
                            parser.csi.private_byte = '\0';
                            parser.csi.final_byte = '\0';
                            // Clear only the slots the previous sequence used.
                            // The count is clamped so that this can never index
                            // past the end of the parameter array.
                            let used = parser.csi.param_count.min(parser.csi.params.len());
                            parser.csi.params[..used].fill(0);
                            parser.csi.param_count = 0;
                        }
                        b']' => {
                            parser.state = State::Osc;
                        }
                        b'O' => {
                            parser.state = State::Ss3;
                        }
                        b'P' => {
                            parser.state = State::Dcs;
                        }
                        c => {
                            parser.state = State::Ground;
                            return Some(Token::Esc(c as char));
                        }
                    }
                }
                State::Ss3 => {
                    parser.state = State::Ground;
                    let c = bytes[self.off];
                    self.off += 1;
                    return Some(Token::SS3(c as char));
                }
                State::Csi => {
                    loop {
                        // If we still have slots left, parse the parameter.
                        if parser.csi.param_count < parser.csi.params.len() {
                            let dst = &mut parser.csi.params[parser.csi.param_count];
                            while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
                                let v = *dst * 10 + bytes[self.off] as i32 - b'0' as i32;
                                // Saturate so that overlong numbers can't overflow.
                                *dst = v.min(0xffff);
                                self.off += 1;
                            }
                        } else {
                            // ...otherwise, skip the parameters until we find the final byte.
                            while self.off < bytes.len() && bytes[self.off].is_ascii_digit() {
                                self.off += 1;
                            }
                        }

                        // Encountered the end of the input before finding the final byte.
                        // The state stays `Csi`, so parsing resumes with the next input.
                        if self.off >= bytes.len() {
                            return None;
                        }

                        let c = bytes[self.off];
                        self.off += 1;

                        match c {
                            0x40..=0x7e => {
                                parser.state = State::Ground;
                                parser.csi.final_byte = c as char;
                                // Up to here `param_count` is the index of the parameter
                                // being parsed. Turn it into a count, unless the sequence
                                // had no parameters at all. It is clamped so that it never
                                // exceeds the number of parameters actually stored.
                                if parser.csi.param_count != 0 || parser.csi.params[0] != 0 {
                                    parser.csi.param_count = (parser.csi.param_count + 1)
                                        .min(parser.csi.params.len());
                                }
                                return Some(Token::Csi(&parser.csi));
                            }
                            b';' => {
                                // Move on to the next slot. Once all slots are used up,
                                // stay at `params.len()`, which makes the code above skip
                                // any further parameters.
                                parser.csi.param_count =
                                    (parser.csi.param_count + 1).min(parser.csi.params.len());
                            }
                            b'<'..=b'?' => parser.csi.private_byte = c as char,
                            _ => {}
                        }
                    }
                }
                State::Osc | State::Dcs => {
                    // Remember which kind of string this is *before* the loop below
                    // potentially switches the state to `OscEsc`/`DcsEsc`. Otherwise
                    // an OSC chunk ending in an escape would be reported as DCS.
                    let is_osc = matches!(parser.state, State::Osc);
                    let beg = self.off;
                    let mut data;
                    let mut partial;

                    loop {
                        // Find any indication for the end of the OSC/DCS sequence.
                        self.off = memchr2(b'\x07', b'\x1b', bytes, self.off);

                        data = &input[beg..self.off];
                        partial = self.off >= bytes.len();

                        // Encountered the end of the input before finding the terminator.
                        if partial {
                            break;
                        }

                        let c = bytes[self.off];
                        self.off += 1;

                        if c == 0x1b {
                            // It's only a string terminator if it's followed by \.
                            // We're at the end so we're saving the state and will continue next time.
                            if self.off >= bytes.len() {
                                parser.state = if is_osc {
                                    State::OscEsc
                                } else {
                                    State::DcsEsc
                                };
                                partial = true;
                                break;
                            }

                            // False alarm: Not a string terminator.
                            if bytes[self.off] != b'\\' {
                                continue;
                            }

                            self.off += 1;
                        }

                        break;
                    }

                    if !partial {
                        parser.state = State::Ground;
                    }
                    return Some(if is_osc {
                        Token::Osc { data, partial }
                    } else {
                        Token::Dcs { data, partial }
                    });
                }
                State::OscEsc | State::DcsEsc => {
                    // We were processing an OSC/DCS sequence and the last byte was an escape character.
                    // It's only a string terminator if it's followed by \ (= "\x1b\\").
                    let is_osc = matches!(parser.state, State::OscEsc);

                    if bytes[self.off] == b'\\' {
                        // It was indeed a string terminator and we can now tell the caller about it.
                        // Consume the terminator (one byte in the previous input and this byte).
                        parser.state = State::Ground;
                        self.off += 1;
                        return Some(if is_osc {
                            Token::Osc {
                                data: "",
                                partial: false,
                            }
                        } else {
                            Token::Dcs {
                                data: "",
                                partial: false,
                            }
                        });
                    } else {
                        // False alarm: Not a string terminator.
                        // We'll return the escape character as a separate token.
                        // Processing will continue from the current state (`bytes[self.off]`).
                        parser.state = if is_osc { State::Osc } else { State::Dcs };
                        return Some(if is_osc {
                            Token::Osc {
                                data: "\x1b",
                                partial: true,
                            }
                        } else {
                            Token::Dcs {
                                data: "\x1b",
                                partial: true,
                            }
                        });
                    }
                }
            }
        }

        None
    }
}

View file

@ -0,0 +1,11 @@
@echo off

rem Release build wrapper: sets RUSTFLAGS and then runs a release build with cargo,
rem forwarding all arguments that were given to this script.

rem Avoid linking with vcruntime140.dll by statically linking everything,
rem and then explicitly linking with ucrtbase.dll dynamically.
rem We do this because vcruntime140.dll is an optional Windows component.
set RUSTFLAGS=-Ctarget-feature=+crt-static -Clink-args=/DEFAULTLIB:ucrt.lib -Clink-args=/NODEFAULTLIB:vcruntime.lib -Clink-args=/NODEFAULTLIB:msvcrt.lib -Clink-args=/NODEFAULTLIB:libucrt.lib

rem The backtrace code for panics in Rust is almost as large as the entire editor.
rem Removing all of it gives a huge reduction in binary size.
rem NOTE: The build-std invocation that does this is currently disabled and kept for reference:
rem cargo build --release -Zbuild-std=std,panic_abort -Zbuild-std-features=panic_immediate_abort %*
cargo build --release %*

380
tools/grapheme-table-gen/Cargo.lock generated Normal file
View file

@ -0,0 +1,380 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "anyhow"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34ac096ce696dc2fcabef30516bb13c0a68a11d30131d3df6f04711467681b04"
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "bumpalo"
version = "3.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
[[package]]
name = "cc"
version = "1.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31a0499c1dc64f458ad13872de75c0eb7e3fdb0e67964610c914b034fc5956e"
dependencies = [
"shlex",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.39"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-traits",
"wasm-bindgen",
"windows-targets",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "either"
version = "1.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
[[package]]
name = "grapheme-table-gen"
version = "0.1.0"
dependencies = [
"anyhow",
"chrono",
"indoc",
"pico-args",
"rayon",
"roxmltree",
]
[[package]]
name = "iana-time-zone"
version = "0.1.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]]
name = "indoc"
version = "2.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5"
[[package]]
name = "js-sys"
version = "0.3.76"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "log"
version = "0.4.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "pico-args"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315"
[[package]]
name = "proc-macro2"
version = "1.0.92"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "roxmltree"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c20b6793b5c2fa6553b250154b78d6d0db37e72700ae35fad9387a46f487c97"
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "syn"
version = "2.0.91"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d53cbcb5a243bd33b7858b1d7f4aca2153490815872d86d955d6ea29f743c035"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "unicode-ident"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
[[package]]
name = "wasm-bindgen"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
dependencies = [
"cfg-if",
"once_cell",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.99"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View file

@ -0,0 +1,12 @@
[package]
name = "grapheme-table-gen"
version = "0.1.0"
edition = "2021"
[dependencies]
anyhow = "1.0.95"
chrono = "0.4.39"
indoc = "2.0.5"
pico-args = { version = "0.5.0", features = ["eq-separator"] }
rayon = "1.10.0"
roxmltree = { version = "0.20.0", default-features = false, features = ["std"] }

View file

@ -0,0 +1,850 @@
mod rules;
use crate::rules::{JOIN_RULES_GRAPHEME_CLUSTER, JOIN_RULES_LINE_BREAK};
use anyhow::{bail, Context};
use indoc::writedoc;
use rayon::prelude::*;
use std::collections::HashMap;
use std::fmt::Write as FmtWrite;
use std::io::Write as IoWrite;
use std::ops::RangeInclusive;
use std::path::PathBuf;
type TrieType = u32;
/// Display width class of a codepoint.
///
/// The numeric value is what ends up in the trie (see `trie_value`), so the
/// discriminants are spelled out explicitly and must fit into 2 bits.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum CharacterWidth {
    /// Occupies no column.
    ZeroWidth = 0,
    /// Occupies a single column.
    Narrow = 1,
    /// Occupies two columns.
    Wide = 2,
    /// East Asian "ambiguous" width; replaced by `Narrow` with `--no-ambiguous`.
    Ambiguous = 3,
}
/// Grapheme cluster break class of a codepoint.
///
/// The trailing comments name the grapheme cluster boundary rules each class
/// takes part in. The numeric value is stored in the low 4 bits of a trie
/// value, so the discriminants are spelled out explicitly.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum ClusterBreak {
    Other = 0,          // GB999
    Extend = 1,         // GB9, GB9a -- includes SpacingMark
    RI = 2,             // GB12, GB13
    Prepend = 3,        // GB9b
    HangulL = 4,        // GB6, GB7, GB8
    HangulV = 5,        // GB6, GB7, GB8
    HangulT = 6,        // GB6, GB7, GB8
    HangulLV = 7,       // GB6, GB7, GB8
    HangulLVT = 8,      // GB6, GB7, GB8
    InCBLinker = 9,     // GB9c
    InCBConsonant = 10, // GB9c
    ExtPic = 11,        // GB11
    ZWJ = 12,           // GB9, GB11

    // These are intentionally ordered last, as this allows us to simplify the
    // ucd_grapheme_cluster_is_newline implementation: It is generated as a
    // single "greater than Control" comparison.
    Control = 13, // GB4, GB5
    CR = 14,      // GB3, GB4, GB5
    LF = 15,      // GB3, GB4, GB5
}
/// Line break class of a codepoint.
///
/// The trailing comments give the two-letter class names as they appear in
/// the `lb` attribute of the UCD XML. The numeric value is what gets stored
/// in the trie, so the discriminants are spelled out explicitly.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
#[allow(non_camel_case_types)]
enum LineBreak {
    Other = 0, // Anything else

    // Non-tailorable Line Breaking Classes
    WordJoiner = 1,     // WJ
    ZeroWidthSpace = 2, // ZW
    Glue = 3,           // GL
    Space = 4,          // SP

    // Break Opportunities
    BreakAfter = 5,  // BA
    BreakBefore = 6, // BB
    Hyphen = 7,      // HY

    // Characters Prohibiting Certain Breaks
    ClosePunctuation = 8,        // CL
    CloseParenthesis_EA = 9,     // CP, East Asian
    CloseParenthesis_NotEA = 10, // CP, not East Asian
    Exclamation = 11,            // EX
    Inseparable = 12,            // IN
    Nonstarter = 13,             // NS
    OpenPunctuation_EA = 14,     // OP, East Asian
    OpenPunctuation_NotEA = 15,  // OP, not East Asian
    Quotation = 16,              // QU

    // Numeric Context
    InfixNumericSeparator = 17,     // IS
    Numeric = 18,                   // NU
    PostfixNumeric = 19,            // PO
    PrefixNumeric = 20,             // PR
    SymbolsAllowingBreakAfter = 21, // SY

    // Other Characters
    Alphabetic = 22,  // AL & HL
    Ideographic = 23, // ID & EB & EM
}
/// The data extracted from the UCD XML file by `extract_values_from_ucd`.
#[derive(Clone, Default)]
struct Ucd {
    /// Text of the `<description>` element of the XML file.
    description: String,
    /// One packed trie value per codepoint, indexed by codepoint.
    values: Vec<u32>,
}
/// A single lookup table (stage) of the multi-stage trie.
///
/// A lookup reads `values[prev + ((cp >> shift) & mask)]`, where `prev` is the
/// value read from the previous stage (0 for the first stage).
#[derive(Clone, Default)]
struct Stage {
    /// The table contents.
    values: Vec<u32>,
    /// Position of this stage within the trie; used to name the generated table.
    index: usize,
    /// Right shift applied to the codepoint before masking.
    shift: usize,
    /// Mask applied to the shifted codepoint.
    mask: usize,
    /// Bit width of the integer type used for the generated table (e.g. `uint{bits}_t`).
    bits: usize,
}
/// A multi-stage lookup trie, as produced by `build_best_trie`.
#[derive(Clone, Default)]
struct Trie {
    /// The stages in lookup order.
    stages: Vec<Stage>,
    /// Size of all stages combined; it is summed into the byte count that is
    /// reported in the generated header comment.
    total_size: usize,
}
/// The language of the generated source code (`--lang`).
#[derive(Clone, Copy, Default)]
enum Language {
    /// Emit C code via `generate_c`. This is the default.
    #[default]
    C,
    /// Emit Rust code via `generate_rust`.
    Rust,
}
/// Everything the code generators need: the parsed command-line arguments
/// and the tables computed from the UCD.
#[derive(Default)]
struct Output {
    /// `--lang`: The language to generate.
    arg_lang: Language,
    /// `--no-ambiguous`: Treat all ambiguous-width characters as narrow.
    arg_no_ambiguous: bool,
    /// `--line-breaks`: Store and expose line break information.
    arg_line_breaks: bool,

    /// The per-codepoint values extracted from the UCD.
    ucd: Ucd,
    /// The trie built from `ucd.values`.
    trie: Trie,
    /// The packed grapheme cluster join rules; two tables with one `u32` per row.
    rules_gc: [Vec<u32>; 2],
    /// The packed line break join rules; one `u32` per row.
    rules_lb: Vec<u32>,
    /// Size of the generated tables in bytes, as reported in the output header.
    total_size: usize,
}
/// Usage text that `main` prints when `-h` or `--help` is given.
const HELP: &str = "\
Usage: grapheme-table-gen [options...] <ucd.nounihan.grouped.xml>
-h, --help Prints help information
--lang=<c|rust> Output language (default: c)
--no-ambiguous Treat all ambiguous characters as narrow
--line-breaks Store and expose line break information
";
/// Parses the command line, reads the UCD XML file, builds the lookup tables
/// and writes the generated source code to stdout.
///
/// Returns an error for invalid arguments, an unreadable or malformed input
/// file, or UCD property values this tool doesn't know about.
fn main() -> anyhow::Result<()> {
    let mut args = pico_args::Arguments::from_env();
    if args.contains(["-h", "--help"]) {
        eprint!("{}", HELP);
        return Ok(());
    }

    let mut out = Output {
        // `--lang` is optional: HELP documents "c" as the default, which is
        // also what `Language::default()` yields. `value_from_fn` would fail
        // with a "missing option" error instead, making the flag mandatory.
        arg_lang: args
            .opt_value_from_fn("--lang", |arg| match arg {
                "c" => Ok(Language::C),
                "rust" => Ok(Language::Rust),
                l => bail!("invalid language: \"{}\"", l),
            })?
            .unwrap_or_default(),
        arg_no_ambiguous: args.contains("--no-ambiguous"),
        arg_line_breaks: args.contains("--line-breaks"),
        ..Default::default()
    };
    let arg_input = args.free_from_os_str(|s| -> Result<PathBuf, &'static str> { Ok(s.into()) })?;
    let arg_remaining = args.finish();
    if !arg_remaining.is_empty() {
        bail!("unrecognized arguments: {:?}", arg_remaining);
    }

    let input = std::fs::read_to_string(arg_input)?;
    let doc = roxmltree::Document::parse(&input)?;
    out.ucd = extract_values_from_ucd(&doc, &out)?;

    // Find the best trie configuration over the given block sizes (2^2 - 2^8) and stages (4).
    // More stages = Less size. The trajectory roughly follows a+b*c^stages, where c < 1.
    // 4 still gives ~30% savings over 3 stages and going beyond 5 gives diminishing returns (<10%).
    out.trie = build_best_trie(&out.ucd.values, 2, 8, 4);

    // The join rule tables store one small value per (lead, trail) pair.
    // This packs each row into a single 32-bit integer to save space.
    out.rules_gc = JOIN_RULES_GRAPHEME_CLUSTER
        .map(|t| t.iter().map(|row| prepare_rules_row(row, 2, 3)).collect());
    out.rules_lb = JOIN_RULES_LINE_BREAK
        .iter()
        .map(|row| prepare_rules_row(row, 1, 0))
        .collect();

    // Each rules item has the same length. Each item is 32 bits = 4 bytes.
    out.total_size = out.trie.total_size + out.rules_gc.len() * out.rules_gc[0].len() * 4;
    if out.arg_line_breaks {
        out.total_size += out.rules_lb.len() * 4;
    }

    // Run a quick sanity check to ensure that the trie works as expected:
    // Looking up any codepoint must yield the value we put in.
    for (cp, &expected) in out.ucd.values.iter().enumerate() {
        let mut actual = 0;
        for s in &out.trie.stages {
            actual = s.values[actual as usize + ((cp >> s.shift) & s.mask)];
        }
        assert_eq!(
            expected, actual,
            "trie sanity check failed for U+{:04X}",
            cp
        );
    }

    let buf = match out.arg_lang {
        Language::C => generate_c(out),
        Language::Rust => generate_rust(out),
    };

    std::io::stdout().write_all(buf.as_bytes())?;
    Ok(())
}
impl Output {
    /// Renders the options this run was configured with as a command-line
    /// string, e.g. `--lang=rust --line-breaks`.
    ///
    /// The language is always included; the boolean flags only if they are set.
    fn args(&self) -> String {
        let lang = match self.arg_lang {
            Language::C => "--lang=c",
            Language::Rust => "--lang=rust",
        };

        let mut flags = vec![lang];
        if self.arg_no_ambiguous {
            flags.push("--no-ambiguous");
        }
        if self.arg_line_breaks {
            flags.push("--line-breaks");
        }
        flags.join(" ")
    }
}
/// Generates the C source code for the lookup tables and their accessor functions.
///
/// The result consists of, in this order: a header comment, one array per
/// trie stage, the grapheme cluster join rules, optionally the line break
/// join rules, and finally the inline functions that read those tables.
fn generate_c(out: Output) -> String {
    let mut buf = String::new();

    // Header comment: generation time, UCD description, arguments and table size.
    _ = writedoc!(
        buf,
        "
// BEGIN: Generated by grapheme-table-gen on {}, from {}, with {}, {} bytes
// clang-format off
",
        chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true),
        out.ucd.description,
        out.args(),
        out.total_size,
    );

    // One array per trie stage.
    for stage in &out.trie.stages {
        // Values per output line: 16 for the first stage,
        // and `mask + 1` for all subsequent stages.
        let mut width = 16;
        if stage.index != 0 {
            width = stage.mask + 1;
        }

        _ = write!(
            buf,
            "static const uint{}_t s_stage{}[] = {{",
            stage.bits, stage.index
        );
        for (j, &value) in stage.values.iter().enumerate() {
            if j % width == 0 {
                buf.push_str("\n ");
            }
            // Zero-padded hex, with as many digits as the element type has nibbles.
            _ = write!(buf, " 0x{:01$x},", value, stage.bits / 4);
        }
        buf.push_str("\n};\n");
    }

    // The grapheme cluster join rules, one binary literal per row.
    _ = writeln!(
        buf,
        "static const uint32_t s_grapheme_cluster_join_rules[{}][{}] = {{",
        out.rules_gc.len(),
        out.rules_gc[0].len()
    );
    for table in &out.rules_gc {
        buf.push_str(" {\n");
        for &r in table {
            _ = writeln!(buf, " 0b{:032b},", r);
        }
        buf.push_str(" },\n");
    }
    buf.push_str("};\n");

    // The line break join rules are only emitted with --line-breaks.
    if out.arg_line_breaks {
        _ = writeln!(
            buf,
            "static const uint32_t s_line_break_join_rules[{}] = {{",
            out.rules_lb.len()
        );
        for r in &out.rules_lb {
            _ = writeln!(buf, " 0b{r:032b},");
        }
        buf.push_str("};\n");
    }

    // The trie lookup function: One table read per stage, where each stage
    // after the first is indexed relative to the value read from the previous one.
    buf.push_str("inline int ucd_grapheme_cluster_lookup(const uint32_t cp)\n{\n");
    for stage in &out.trie.stages {
        if stage.index == 0 {
            _ = writeln!(
                buf,
                " const uint{}_t s0 = s_stage0[cp >> {}];",
                stage.bits, stage.shift,
            );
        } else {
            _ = writeln!(
                buf,
                " const uint{}_t s{} = s_stage{}[s{} + ((cp >> {}) & {})];",
                stage.bits,
                stage.index,
                stage.index,
                stage.index - 1,
                stage.shift,
                stage.mask,
            );
        }
    }
    _ = writeln!(buf, " return s{};", out.trie.stages.len() - 1);
    buf.push_str("}\n");

    // Accessors for the packed values. The newline check is a single comparison,
    // because CR and LF are the only ClusterBreak variants ordered after Control.
    _ = writedoc!(
        buf,
        "
inline int ucd_grapheme_cluster_joins(const int state, const int lead, const int trail)
{{
const int l = lead & 15;
const int t = trail & 15;
return (s_grapheme_cluster_join_rules[state][l] >> (t * 2)) & 3;
}}
inline bool ucd_grapheme_cluster_joins_done(const int state)
{{
return state == 3;
}}
inline int ucd_grapheme_cluster_character_width(const int val)
{{
return (val >> 4) & 3;
}}
inline bool ucd_grapheme_cluster_is_newline(const int val)
{{
return (val & 15) > {};
}}
",
        ClusterBreak::Control as u32,
    );

    if out.arg_line_breaks {
        _ = writedoc!(
            buf,
            "
inline bool ucd_line_break_joins(const int lead, const int trail)
{{
const int l = lead >> 6;
const int t = trail >> 6;
return (s_line_break_join_rules[l] >> t) & 1;
}}
",
        );
    }

    buf.push_str("// clang-format on\n// END: Generated by grapheme-table-gen\n");
    buf
}
/// Generates the Rust source code for the lookup tables and their accessor functions.
///
/// The result consists of, in this order: a header comment, one array per
/// trie stage, the grapheme cluster join rules, optionally the line break
/// join rules, and finally the functions that read those tables.
fn generate_rust(out: Output) -> String {
    let mut buf = String::new();

    // Header comment: generation time, UCD description, arguments and table size.
    _ = writeln!(
        buf,
        "// BEGIN: Generated by grapheme-table-gen on {}, from {}, with {}, {} bytes",
        chrono::Utc::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, true),
        out.ucd.description,
        out.args(),
        out.total_size,
    );

    // One array per trie stage.
    for stage in &out.trie.stages {
        // Values per output line: 16 for the first stage,
        // and `mask + 1` for all subsequent stages.
        let mut width = 16;
        if stage.index != 0 {
            width = stage.mask + 1;
        }

        _ = write!(
            buf,
            "#[rustfmt::skip]\npub const STAGE{}: [u{}; {}] = [",
            stage.index,
            stage.bits,
            stage.values.len(),
        );
        for (j, &value) in stage.values.iter().enumerate() {
            if j % width == 0 {
                buf.push_str("\n ");
            }
            // Zero-padded hex, with as many digits as the element type has nibbles.
            _ = write!(buf, " 0x{:01$x},", value, stage.bits / 4);
        }
        buf.push_str("\n];\n");
    }

    // The grapheme cluster join rules, one binary literal per row.
    _ = writeln!(
        buf,
        "#[rustfmt::skip]\npub const GRAPHEME_JOIN_RULES: [[u32; {}]; {}] = [",
        out.rules_gc[0].len(),
        out.rules_gc.len(),
    );
    for table in &out.rules_gc {
        buf.push_str(" [\n");
        for &r in table {
            _ = writeln!(buf, " 0b{:032b},", r);
        }
        buf.push_str(" ],\n");
    }
    buf.push_str("];\n");

    // The line break join rules are only emitted with --line-breaks.
    if out.arg_line_breaks {
        _ = writeln!(
            buf,
            "#[rustfmt::skip]\npub const LINE_BREAK_JOIN_RULES: [u32; {}] = [",
            out.rules_lb.len(),
        );
        for r in &out.rules_lb {
            _ = writeln!(buf, " 0b{r:032b},");
        }
        buf.push_str("];\n");
    }

    // The trie lookup function: One table read per stage, where each stage
    // after the first is indexed relative to the value read from the previous one.
    _ = writedoc!(
        buf,
        "
        #[inline(always)]
        pub fn ucd_grapheme_cluster_lookup(cp: char) -> usize {{
        let cp = cp as usize;
        ",
    );
    for stage in &out.trie.stages {
        if stage.index == 0 {
            _ = writeln!(
                buf,
                " let s = STAGE{}[cp >> {}] as usize;",
                stage.index, stage.shift,
            );
        } else if stage.index != out.trie.stages.len() - 1 {
            // Intermediate stages must shift the codepoint just like every other
            // stage does. Indexing with `cp & mask` alone would read the wrong
            // entry and disagree with both the C output and the sanity check.
            _ = writeln!(
                buf,
                " let s = STAGE{}[s + ((cp >> {}) & {})] as usize;",
                stage.index, stage.shift, stage.mask,
            );
        } else {
            // The last stage is the tail expression of the generated function.
            _ = writeln!(
                buf,
                " STAGE{}[s + ((cp >> {}) & {})] as usize",
                stage.index, stage.shift, stage.mask,
            );
        }
    }
    buf.push_str("}\n");

    // Accessors for the packed values. The newline check is a single comparison,
    // because CR and LF are the only ClusterBreak variants ordered after Control.
    _ = writedoc!(
        buf,
        "
        #[inline(always)]
        pub fn ucd_grapheme_cluster_joins(state: u32, lead: usize, trail: usize) -> u32 {{
        let l = lead & 15;
        let t = trail & 15;
        (GRAPHEME_JOIN_RULES[state as usize][l] >> (t * 2)) & 3
        }}
        #[inline(always)]
        pub fn ucd_grapheme_cluster_joins_done(state: u32) -> bool {{
        state == 3
        }}
        #[inline(always)]
        pub fn ucd_grapheme_cluster_character_width(val: usize) -> usize {{
        (val >> 4) & 3
        }}
        #[inline(always)]
        pub fn ucd_grapheme_cluster_is_newline(val: usize) -> bool {{
        (val & 15) > {}
        }}
        ",
        ClusterBreak::Control as u32,
    );

    if out.arg_line_breaks {
        _ = writedoc!(
            buf,
            "
            #[inline(always)]
            pub fn ucd_line_break_joins(lead: usize, trail: usize) -> bool {{
            let l = lead >> 6;
            let t = trail >> 6;
            ((LINE_BREAK_JOIN_RULES[l] >> t) & 1) != 0
            }}
            ",
        );
    }

    buf.push_str("// END: Generated by grapheme-table-gen\n");
    buf
}
/// Walks the `<repertoire>` of the UCD XML and computes the packed trie value
/// (cluster break, character width, line break) for every codepoint.
///
/// Codepoints that don't occur in the file keep the default of
/// Other / Narrow / Other. A handful of codepoints get special treatment
/// after the main loop; see the comments at the end of the function.
///
/// Returns an error if the `<description>` or `<repertoire>` element is
/// missing, or if a GCB, InCB or ea property has an unknown value.
fn extract_values_from_ucd(doc: &roxmltree::Document, out: &Output) -> anyhow::Result<Ucd> {
    // With --no-ambiguous, ambiguous-width characters are simply narrow.
    let ambiguous_value = if out.arg_no_ambiguous {
        CharacterWidth::Narrow
    } else {
        CharacterWidth::Ambiguous
    };

    // One entry per codepoint: 1114112 = 0x110000, i.e. U+0000 to U+10FFFF.
    let mut values = vec![
        trie_value(
            ClusterBreak::Other,
            CharacterWidth::Narrow,
            LineBreak::Other
        );
        1114112
    ];

    let ns = "http://www.unicode.org/ns/2003/ucd/1.0";
    let root = doc.root_element();
    let description = root
        .children()
        .find(|n| n.has_tag_name((ns, "description")))
        .context("missing ucd description")?;
    let repertoire = root
        .children()
        .find(|n| n.has_tag_name((ns, "repertoire")))
        .context("missing ucd repertoire")?;
    let description = description.text().unwrap_or_default().to_string();

    // The repertoire consists of groups, which in turn contain the characters.
    // Attributes that are missing on a character are inherited from its group.
    for group in repertoire.children().filter(|n| n.is_element()) {
        const DEFAULT_ATTRIBUTES: UcdAttributes = UcdAttributes {
            general_category: "",
            line_break: "",
            grapheme_cluster_break: "",
            indic_conjunct_break: "",
            extended_pictographic: "",
            east_asian: "",
        };
        let group_attributes = extract_attributes(&group, &DEFAULT_ATTRIBUTES);

        for char in group.children().filter(|n| n.is_element()) {
            let char_attributes = extract_attributes(&char, &group_attributes);
            let range = extract_range(&char);

            let mut cb = match char_attributes.grapheme_cluster_break {
                "XX" => ClusterBreak::Other, // Anything else
                // We ignore GB3 which demands that CR × LF do not break apart, because
                // * these control characters won't normally reach our text storage
                // * otherwise we're in a raw write mode and historically conhost stores them in separate cells
                "CR" => ClusterBreak::CR,            // Carriage Return
                "LF" => ClusterBreak::LF,            // Line Feed
                "CN" => ClusterBreak::Control,       // Control
                "EX" | "SM" => ClusterBreak::Extend, // Extend, SpacingMark
                "PP" => ClusterBreak::Prepend,       // Prepend
                "ZWJ" => ClusterBreak::ZWJ,          // Zero Width Joiner
                "RI" => ClusterBreak::RI,            // Regional Indicator
                "L" => ClusterBreak::HangulL,        // Hangul Syllable Type L
                "V" => ClusterBreak::HangulV,        // Hangul Syllable Type V
                "T" => ClusterBreak::HangulT,        // Hangul Syllable Type T
                "LV" => ClusterBreak::HangulLV,      // Hangul Syllable Type LV
                "LVT" => ClusterBreak::HangulLVT,    // Hangul Syllable Type LVT
                _ => bail!(
                    "Unrecognized GCB {:?} for U+{:04X} to U+{:04X}",
                    char_attributes.grapheme_cluster_break,
                    range.start(),
                    range.end()
                ),
            };

            if char_attributes.extended_pictographic == "Y" {
                // Currently every single Extended_Pictographic codepoint happens to be GCB=XX.
                // This is fantastic for us because it means we can stuff it into the ClusterBreak enum
                // and treat it as an alias of EXTEND, but with the special GB11 properties.
                if cb != ClusterBreak::Other {
                    bail!(
                        "Unexpected GCB {:?} with ExtPict=Y for U+{:04X} to U+{:04X}",
                        char_attributes.grapheme_cluster_break,
                        range.start(),
                        range.end()
                    );
                }
                cb = ClusterBreak::ExtPic;
            }

            // Indic conjunct linkers and consonants override whatever
            // cluster break class was determined above.
            cb = match char_attributes.indic_conjunct_break {
                "None" | "Extend" => cb,
                "Linker" => ClusterBreak::InCBLinker,
                "Consonant" => ClusterBreak::InCBConsonant,
                _ => bail!(
                    "Unrecognized InCB {:?} for U+{:04X} to U+{:04X}",
                    char_attributes.indic_conjunct_break,
                    range.start(),
                    range.end()
                ),
            };

            let mut width = match char_attributes.east_asian {
                "N" | "Na" | "H" => CharacterWidth::Narrow, // Neutral, Narrow, Half-width
                "F" | "W" => CharacterWidth::Wide,          // Full-width, Wide
                "A" => ambiguous_value,                     // Ambiguous
                _ => bail!(
                    "Unrecognized ea {:?} for U+{:04X} to U+{:04X}",
                    char_attributes.east_asian,
                    range.start(),
                    range.end()
                ),
            };

            // There's no "ea" attribute for "zero width" so we need to do that ourselves. This matches:
            //   Me: Mark, enclosing
            //   Mn: Mark, non-spacing
            //   Cf: Control, format
            match char_attributes.general_category {
                "Cf" if cb == ClusterBreak::Control => {
                    // A significant portion of Cf characters are not just gc=Cf (= commonly considered zero-width),
                    // but also GCB=CN (= does not join). This is a bit of a problem for terminals,
                    // because they don't support zero-width graphemes, as zero-width columns can't exist.
                    // So, we turn all of them into Extend, which is roughly how wcswidth() would treat them.
                    cb = ClusterBreak::Extend;
                    width = CharacterWidth::ZeroWidth;
                }
                "Me" | "Mn" | "Cf" => {
                    width = CharacterWidth::ZeroWidth;
                }
                _ => {}
            };

            // Line break classes are only stored with --line-breaks.
            // Everything else is LineBreak::Other.
            let lb = if out.arg_line_breaks {
                // CP and OP are split into an East Asian and a non-East Asian variant.
                let lb_ea = matches!(char_attributes.east_asian, "F" | "W" | "H");
                match char_attributes.line_break {
                    "WJ" => LineBreak::WordJoiner,
                    "ZW" => LineBreak::ZeroWidthSpace,
                    "GL" => LineBreak::Glue,
                    "SP" => LineBreak::Space,
                    "BA" => LineBreak::BreakAfter,
                    "BB" => LineBreak::BreakBefore,
                    "HY" => LineBreak::Hyphen,
                    "CL" => LineBreak::ClosePunctuation,
                    "CP" if lb_ea => LineBreak::CloseParenthesis_EA,
                    "CP" => LineBreak::CloseParenthesis_NotEA,
                    "EX" => LineBreak::Exclamation,
                    "IN" => LineBreak::Inseparable,
                    "NS" => LineBreak::Nonstarter,
                    "OP" if lb_ea => LineBreak::OpenPunctuation_EA,
                    "OP" => LineBreak::OpenPunctuation_NotEA,
                    "QU" => LineBreak::Quotation,
                    "IS" => LineBreak::InfixNumericSeparator,
                    "NU" => LineBreak::Numeric,
                    "PO" => LineBreak::PostfixNumeric,
                    "PR" => LineBreak::PrefixNumeric,
                    "SY" => LineBreak::SymbolsAllowingBreakAfter,
                    "AL" | "HL" => LineBreak::Alphabetic,
                    "ID" | "EB" | "EM" => LineBreak::Ideographic,
                    _ => LineBreak::Other,
                }
            } else {
                LineBreak::Other
            };

            // NOTE: Indexing panics if the range lies outside of the table.
            values[range].fill(trie_value(cb, width, lb));
        }
    }

    // U+00AD: Soft Hyphen
    // A soft hyphen is a hint that a word break is allowed at that position.
    // By default, the glyph is supposed to be invisible, and only if
    // a word break occurs, the text renderer should display a hyphen.
    // A terminal does not support computerized typesetting, but unlike the other
    // gc=Cf cases we give it a Narrow width, because that matches wcswidth().
    values[0x00AD] = trie_value_mod_width(values[0x00AD], CharacterWidth::Narrow);

    // U+2500 to U+257F: Box Drawing block
    // U+2580 to U+259F: Block Elements block
    // By default, CharacterWidth.Ambiguous, but by convention .Narrow in terminals.
    //
    // Most of these characters are LineBreak.Other, but some are actually LineBreak.Alphabetic.
    // But to us this doesn't really matter much, because it doesn't make much sense anyway that
    // a light double dash is "alphabetic" while a light triple dash is not.
    values[0x2500..=0x259F].fill(trie_value(
        ClusterBreak::Other,
        CharacterWidth::Narrow,
        LineBreak::Other,
    ));

    // U+FE0F Variation Selector-16 is used to turn unqualified Emojis into qualified ones.
    // By convention, this turns them from being ambiguous width (= narrow) into wide ones.
    // We achieve this here by explicitly giving this codepoint a wide width.
    // Later down below we'll clamp width back to <= 2.
    //
    // U+FE0F actually has a LineBreak property of CM (Combining Mark),
    // but for us that's equivalent to Other.
    values[0xFE0F] = trie_value_mod_width(values[0xFE0F], CharacterWidth::Wide);

    Ok(Ucd {
        description,
        values,
    })
}
// Raw, string-valued properties of a single UCD XML node.
//
// Every field holds the value of one XML attribute as read by `extract_attributes`,
// which falls back to the matching field of a "default" `UcdAttributes` when the
// attribute is absent on the node.
struct UcdAttributes<'a> {
// Value of the `gc` attribute.
general_category: &'a str,
// Value of the `lb` attribute.
line_break: &'a str,
// Value of the `GCB` attribute.
grapheme_cluster_break: &'a str,
// Value of the `InCB` attribute.
indic_conjunct_break: &'a str,
// Value of the `ExtPict` attribute.
extended_pictographic: &'a str,
// Value of the `ea` attribute.
east_asian: &'a str,
}
// Reads the properties we care about from `node`.
// Any attribute that is missing on the node is taken from `default` instead.
fn extract_attributes<'a>(
    node: &'a roxmltree::Node,
    default: &'a UcdAttributes,
) -> UcdAttributes<'a> {
    // Returns the named attribute of `node`, or `fallback` if the node doesn't carry it.
    fn attr_or<'a>(node: &'a roxmltree::Node, name: &'static str, fallback: &'a str) -> &'a str {
        node.attribute(name).unwrap_or(fallback)
    }

    let general_category = attr_or(node, "gc", default.general_category);
    let line_break = attr_or(node, "lb", default.line_break);
    let grapheme_cluster_break = attr_or(node, "GCB", default.grapheme_cluster_break);
    let indic_conjunct_break = attr_or(node, "InCB", default.indic_conjunct_break);
    let extended_pictographic = attr_or(node, "ExtPict", default.extended_pictographic);
    let east_asian = attr_or(node, "ea", default.east_asian);

    UcdAttributes {
        general_category,
        line_break,
        grapheme_cluster_break,
        indic_conjunct_break,
        extended_pictographic,
        east_asian,
    }
}
// Returns the inclusive codepoint range described by `node`.
//
// A `cp` attribute describes a single codepoint. Otherwise the range is taken from
// `first-cp` and `last-cp`, each of which is treated as "0" when missing.
// Panics if any of the used attributes is not a valid hexadecimal number.
fn extract_range(node: &roxmltree::Node) -> RangeInclusive<usize> {
    let parse_hex = |text: &str| usize::from_str_radix(text, 16).unwrap();

    if let Some(text) = node.attribute("cp") {
        let cp = parse_hex(text);
        return cp..=cp;
    }

    let first = parse_hex(node.attribute("first-cp").unwrap_or("0"));
    let last = parse_hex(node.attribute("last-cp").unwrap_or("0"));
    first..=last
}
// Packs the three per-codepoint properties into a single trie value:
// the cluster break sits in the lowest bits, the width is shifted left by 4
// and the line break is shifted left by 6.
fn trie_value(cb: ClusterBreak, width: CharacterWidth, lb: LineBreak) -> TrieType {
    (cb as TrieType) | ((width as TrieType) << 4) | ((lb as TrieType) << 6)
}
// Returns `value` with its width field (the two bits starting at bit 4)
// replaced by `width`. All other bits are left untouched.
fn trie_value_mod_width(value: TrieType, width: CharacterWidth) -> TrieType {
    const WIDTH_SHIFT: u32 = 4;
    const WIDTH_MASK: TrieType = 3 << WIDTH_SHIFT;

    let cleared = value & !WIDTH_MASK;
    cleared | ((width as TrieType) << WIDTH_SHIFT)
}
// Builds a trie for every combination of per-stage shifts in `min_shift..=max_shift`
// and returns the one with the smallest `total_size`.
//
// A trie with `stages` stages needs `stages - 1` shifts, so there are
// `(max_shift - min_shift + 1) ^ (stages - 1)` combinations. They are built in parallel.
fn build_best_trie(
    uncompressed: &[TrieType],
    min_shift: usize,
    max_shift: usize,
    stages: usize,
) -> Trie {
    let depth = stages - 1;
    let delta = max_shift - min_shift + 1;
    let total = delta.pow(depth as u32);

    // Treat each task number as a `depth`-digit number in base `delta`:
    // every digit selects the shift of one stage (least significant digit first).
    let tasks: Vec<Vec<usize>> = (0..total)
        .map(|task| {
            let mut remainder = task;
            (0..depth)
                .map(|_| {
                    let shift = min_shift + (remainder % delta);
                    remainder /= delta;
                    shift
                })
                .collect()
        })
        .collect();

    tasks
        .par_iter()
        .map(|shifts| build_trie(uncompressed.to_vec(), shifts))
        .min_by_key(|trie| trie.total_size)
        .unwrap()
}
// Compresses `uncompressed` into a multi-stage lookup trie.
//
// One stage is produced per entry in `shifts`, plus a final stage holding whatever
// index array is left over after the last pass. The returned `Trie` contains the
// stages in reverse order of creation (the leftover index array first) together
// with the summed size of all stages in bytes.
fn build_trie(mut uncompressed: Vec<TrieType>, shifts: &[usize]) -> Trie {
let mut cumulative_shift = 0;
let mut stages = Vec::new();
// Each pass splits the current array into chunks of `1 << shift` values, stores every
// distinct chunk once in `compressed` and replaces the chunk with its offset.
// The resulting offset array is the input of the next pass.
for &shift in shifts.iter() {
let chunk_size = 1 << shift;
// Maps chunk contents to the offset at which that chunk lives in `compressed`.
let mut cache = HashMap::new();
let mut compressed = Vec::new();
let mut offsets = Vec::new();
for off in (0..uncompressed.len()).step_by(chunk_size) {
// The last chunk may be shorter than `chunk_size`.
let chunk = &uncompressed[off..off + chunk_size.min(uncompressed.len() - off)];
let offset = cache.entry(chunk).or_insert_with(|| {
if let Some(existing) = find_existing(&compressed, chunk) {
// The chunk already occurs somewhere in the compressed data: reuse it.
existing as TrieType
} else {
// Otherwise append it, but skip the part that already matches the tail of `compressed`.
let overlap = measure_overlap(&compressed, chunk);
compressed.extend_from_slice(&chunk[overlap..]);
(compressed.len() - chunk.len()) as TrieType
}
});
offsets.push(*offset);
}
stages.push(Stage {
values: compressed,
index: shifts.len() - stages.len(),
shift: cumulative_shift,
mask: chunk_size - 1,
// Filled in below, once all stages exist.
bits: 0,
});
uncompressed = offsets;
cumulative_shift += shift;
}
// Whatever is left after the last pass becomes the final stage. It is not masked.
stages.push(Stage {
values: uncompressed,
index: 0,
shift: cumulative_shift,
mask: usize::MAX,
bits: 0,
});
stages.reverse();
// Pick the narrowest of 8, 16 or 32 bits that can hold the largest value of each stage.
for stage in stages.iter_mut() {
let max_val = stage.values.iter().max().cloned().unwrap_or(0);
stage.bits = match max_val {
0..0x100 => 8,
0x100..0x10000 => 16,
_ => 32,
};
}
// Total size in bytes: bytes per value times number of values, summed over all stages.
let total_size: usize = stages
.iter()
.map(|stage| (stage.bits / 8) * stage.values.len())
.sum();
Trie { stages, total_size }
}
// Returns the offset of the first occurrence of `needle` within `haystack`,
// or `None` if `haystack` doesn't contain it.
//
// An empty `needle` is trivially found at offset 0.
fn find_existing(haystack: &[TrieType], needle: &[TrieType]) -> Option<usize> {
    // `slice::windows` panics when asked for windows of size 0,
    // so the empty needle has to be handled up front.
    if needle.is_empty() {
        return Some(0);
    }

    haystack
        .windows(needle.len())
        .position(|window| window == needle)
}
// Returns the length of the longest suffix of `prev` that is also a prefix of `next`.
//
// The result is at most `min(prev.len(), next.len())` and 0 if there's no overlap at all.
fn measure_overlap(prev: &[TrieType], next: &[TrieType]) -> usize {
    let longest = prev.len().min(next.len());

    // The range must be inclusive: an exclusive upper bound would never test the longest
    // possible overlap, e.g. when all of `prev` is a prefix of `next`.
    // A length of 0 always matches (two empty slices), so the search cannot fail.
    (0..=longest)
        .rev()
        .find(|&len| prev[prev.len() - len..] == next[..len])
        .unwrap_or(0)
}
// Packs one row of a join-rules table into a single `u32`.
//
// The entry at position `trail` occupies the bits starting at `trail * bit_width`.
// Negative entries (the "does not join" marker) are stored as `non_joiner_value`.
fn prepare_rules_row(row: &[i32], bit_width: usize, non_joiner_value: i32) -> u32 {
    let mut packed = 0u32;
    for (trail, &entry) in row.iter().enumerate() {
        let resolved = if entry < 0 { non_joiner_value } else { entry };
        packed |= (resolved as u32) << (trail * bit_width);
    }
    packed
}

View file

@ -0,0 +1,279 @@
// Used as an indicator in our rules for ÷ ("does not join").
// An uppercase `X` was picked as the name, because it is a valid identifier,
// is monospace in most fonts and is visually distinct from the digits used in the tables.
const X: i32 = -1;
// The following rules are based on the Grapheme Cluster Boundaries section of Unicode Standard Annex #29,
// but slightly modified to allow for use with a plain MxN lookup table.
//
// Break at the start and end of text, unless the text is empty.
// GB1: ~ sot ÷ Any
// GB2: ~ Any ÷ eot
// Handled by our ucd_* functions.
//
// Do not break between a CR and LF. Otherwise, break before and after controls.
// GB3: ✓ CR × LF
// GB4: ✓ (Control | CR | LF) ÷
// GB5: ✓ ÷ (Control | CR | LF)
//
// Do not break Hangul syllable or other conjoining sequences.
// GB6: ✓ L × (L | V | LV | LVT)
// GB7: ✓ (LV | V) × (V | T)
// GB8: ✓ (LVT | T) × T
//
// Do not break before extending characters or ZWJ.
// GB9: ✓ × (Extend | ZWJ)
//
// Do not break before SpacingMarks, or after Prepend characters.
// GB9a: ✓ × SpacingMark
// GB9b: ✓ Prepend ×
//
// Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.
// GB9c: ~ \p{InCB=Linker} × \p{InCB=Consonant}
// × \p{InCB=Linker}
// modified from
// \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* × \p{InCB=Consonant}
// because this has almost the same effect from what I can tell for most text, and greatly simplifies our design.
//
// Do not break within emoji modifier sequences or emoji zwj sequences.
// GB11: ~ ZWJ × \p{Extended_Pictographic} modified from \p{Extended_Pictographic} Extend* ZWJ × \p{Extended_Pictographic}
// because this allows us to use LUTs, while working for most valid text.
//
// Do not break within emoji flag sequences. That is, do not break between regional indicator (RI) symbols if there is an odd number of RI characters before the break point.
// GB12: ~ sot (RI RI)* RI × RI
// GB13: ~ [^RI] (RI RI)* RI × RI
// the lookup table we generate supports RIs via something akin to RI ÷ RI × RI ÷ RI, but the corresponding
// grapheme cluster algorithm doesn't count them. It would need to be updated to recognize and special-case RIs.
//
// Otherwise, break everywhere.
// GB999: ✓ Any ÷ Any
//
// This is a great reference for the resulting table:
// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.html
//
// The table is laid out as [state][leading][trailing]:
// * `X` means ÷ (break between the two codepoints),
// * `0` means × (join),
// * `1` means × and, going by the comment on the second table, also a switch to state table 1.
//
// IMPORTANT: Both state tables must list the classes in the exact same order, because they are
// expected to be indexed with the same cluster break values (presumably the declaration order of
// the `ClusterBreak` enum; verify against the enum when reordering anything here).
#[rustfmt::skip]
pub const JOIN_RULES_GRAPHEME_CLUSTER: [[[i32; 16]; 16]; 2] = [
    // Base table
    [
        /* ↓ leading        → trailing codepoint */
        /*               | Other | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | Control | CR | LF | */
        /* Other         | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* Extend        | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* RI            | */ [X /* | */, 0 /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* Prepend       | */ [0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulL       | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, 0 /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulV       | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulT       | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulLV      | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulLVT     | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* InCBLinker    | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* InCBConsonant | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* ExtPic        | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* ZWJ           | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* Control       | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */],
        /* CR            | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */],
        /* LF            | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */],
    ],
    // Once we have encountered a Regional Indicator pair we'll enter this table.
    // It's a copy of the base table, but instead of RI × RI, we're RI ÷ RI.
    // The rows and columns are in the same order as in the base table.
    [
        /* ↓ leading        → trailing codepoint */
        /*               | Other | Extend | RI | Prepend | HangulL | HangulV | HangulT | HangulLV | HangulLVT | InCBLinker | InCBConsonant | ExtPic | ZWJ | Control | CR | LF | */
        /* Other         | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* Extend        | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* RI            | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* Prepend       | */ [0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulL       | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, 0 /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulV       | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulT       | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulLV      | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* HangulLVT     | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* InCBLinker    | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* InCBConsonant | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* ExtPic        | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* ZWJ           | */ [X /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */, X /* | */, 0 /* | */, 0 /* | */, X /* | */, X /* | */, X /* | */],
        /* Control       | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */],
        /* CR            | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 0 /* | */],
        /* LF            | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */],
    ],
];
// The following rules are based on Unicode Standard Annex #14: Line Breaking Properties,
// but heavily modified to allow for use with lookup tables.
//
// NOTE: If you convert these rules into a lookup table, you must apply them in reverse order.
// This is because the rules are ordered from most to least important (e.g. LB8 overrides LB18).
//
// Resolve line breaking classes:
// LB1: Assign a line breaking class [...].
// ✗ Unicode does that for us via the "lb" attribute.
//
// Start and end of text:
// LB2: Never break at the start of text.
// ~ Functionality not needed.
// LB3: Always break at the end of text.
// ~ Functionality not needed.
//
// Mandatory breaks:
// LB4: Always break after hard line breaks.
// ~ Handled by our ucd_* functions.
// LB5: Treat CR followed by LF, as well as CR, LF, and NL as hard line breaks.
// ~ Handled by our ucd_* functions.
// LB6: Do not break before hard line breaks.
// ~ Handled by our ucd_* functions.
//
// Explicit breaks and non-breaks:
// LB7: Do not break before spaces or zero width space.
// ✗ It's way simpler to treat spaces as if they always break.
// LB8: Break before any character following a zero-width space, even if one or more spaces intervene.
// ~ ZW ÷ modified from ZW SP* ÷ because it's not worth being this anal about accuracy here.
// LB8a: Do not break after a zero width joiner.
// ~ Our ucd_* functions never break within grapheme clusters.
//
// Combining marks:
// LB9: Do not break a combining character sequence; treat it as if it has the line breaking class of the base character in all of the following rules. Treat ZWJ as if it were CM.
// ~ Our ucd_* functions never break within grapheme clusters.
// LB10: Treat any remaining combining mark or ZWJ as AL.
// ✗ To be honest, I'm not entirely sure I understand the implications of this rule.
//
// Word joiner:
// LB11: Do not break before or after Word joiner and related characters.
// ✓ × WJ
// ✓ WJ ×
//
// Non-breaking characters:
// LB12: Do not break after NBSP and related characters.
// ✓ GL ×
// LB12a: Do not break before NBSP and related characters, except after spaces and hyphens.
// ✓ [^SP BA HY] × GL
//
// Opening and closing:
// LB13: Do not break before ']' or '!' or '/', even after spaces.
// ✓ × CL
// ✓ × CP
// ✓ × EX
// ✓ × SY
// LB14: Do not break after '[', even after spaces.
// ~ OP × modified from OP SP* × just because it's simpler. It would be nice to address this.
// LB15a: Do not break after an unresolved initial punctuation that lies at the start of the line, after a space, after opening punctuation, or after an unresolved quotation mark, even after spaces.
// ✗ Not implemented. Seemed too complex for little gain?
// LB15b: Do not break before an unresolved final punctuation that lies at the end of the line, before a space, before a prohibited break, or before an unresolved quotation mark, even after spaces.
// ✗ Not implemented. Seemed too complex for little gain?
// LB15c: Break before a decimal mark that follows a space, for instance, in 'subtract .5'.
// ~ SP ÷ IS modified from SP ÷ IS NU because this fits neatly with LB15d.
// LB15d: Otherwise, do not break before ';', ',', or '.', even after spaces.
// ✓ × IS
// LB16: Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces.
// ✗ Not implemented. Could be useful in the future, but its usefulness seemed limited to me.
// LB17: Do not break within '——', even with intervening spaces.
// ✗ Not implemented. Neither terminal applications nor code use em-dashes much anyway.
//
// Spaces:
// LB18: Break after spaces.
// ✗ Implemented because we didn't implement LB7.
//
// Special case rules:
// LB19: Do not break before non-initial unresolved quotation marks, such as ' ” ' or ' " ', nor after non-final unresolved quotation marks, such as ' “ ' or ' " '.
// ~ × QU modified from × [ QU - \p{Pi} ]
// ~ QU × modified from [ QU - \p{Pf} ] ×
// We implement the Unicode 16.0 instead of 16.1 rules, because it's simpler and allows us to use a LUT.
// LB19a: Unless surrounded by East Asian characters, do not break either side of any unresolved quotation marks.
// ✗ [^$EastAsian] × QU
// ✗ × QU ( [^$EastAsian] | eot )
// ✗ QU × [^$EastAsian]
// ✗ ( sot | [^$EastAsian] ) QU ×
// Same as LB19.
// LB20: Break before and after unresolved CB.
// ✗ We break by default. Unicode inline objects are super irrelevant in a terminal in either case.
// LB20a: Do not break after a word-initial hyphen.
// ✗ Not implemented. Seemed not worth the hassle as the window will almost always be >1 char wide.
// LB21: Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana, and other non-starters, or after acute accents.
// ✓ × BA
// ✓ × HY
// ✓ × NS
// ✓ BB ×
// LB21a: Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew.
// ✗ Not implemented. Perhaps in the future.
// LB21b: Do not break between Solidus and Hebrew letters.
// ✗ Not implemented. Perhaps in the future.
// LB22: Do not break before ellipses.
// ✓ × IN
//
// Numbers:
// LB23: Do not break between digits and letters.
// ✓ (AL | HL) × NU
// ✓ NU × (AL | HL)
// LB23a: Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
// ✓ PR × (ID | EB | EM)
// ✓ (ID | EB | EM) × PO
// LB24: Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix.
// ✓ (PR | PO) × (AL | HL)
// ✓ (AL | HL) × (PR | PO)
// LB25: Do not break numbers:
// ~ CL × PO modified from NU ( SY | IS )* CL × PO
// ~ CP × PO modified from NU ( SY | IS )* CP × PO
// ~ CL × PR modified from NU ( SY | IS )* CL × PR
// ~ CP × PR modified from NU ( SY | IS )* CP × PR
// ~ ( NU | SY | IS ) × PO modified from NU ( SY | IS )* × PO
// ~ ( NU | SY | IS ) × PR modified from NU ( SY | IS )* × PR
// ~ PO × OP modified from PO × OP NU
// ~ PO × OP modified from PO × OP IS NU
// ✓ PO × NU
// ~ PR × OP modified from PR × OP NU
// ~ PR × OP modified from PR × OP IS NU
// ✓ PR × NU
// ✓ HY × NU
// ✓ IS × NU
// ~ ( NU | SY | IS ) × NU modified from NU ( SY | IS )* × NU
// Most were simplified because the cases this additionally allows don't matter much here.
//
// Korean syllable blocks
// LB26: Do not break a Korean syllable.
// ✗ Our ucd_* functions never break within grapheme clusters.
// LB27: Treat a Korean Syllable Block the same as ID.
// ✗ Our ucd_* functions never break within grapheme clusters.
//
// Finally, join alphabetic letters into words and break everything else.
// LB28: Do not break between alphabetics ("at").
// ✓ (AL | HL) × (AL | HL)
// LB28a: Do not break inside the orthographic syllables of Brahmic scripts.
// ✗ Our ucd_* functions never break within grapheme clusters.
// LB29: Do not break between numeric punctuation and alphabetics ("e.g.").
// ✓ IS × (AL | HL)
// LB30: Do not break between letters, numbers, or ordinary symbols and opening or closing parentheses.
// ✓ (AL | HL | NU) × [OP-$EastAsian]
// ✓ [CP-$EastAsian] × (AL | HL | NU)
// LB30a: Break between two regional indicator symbols if and only if there are an even number of regional indicators preceding the position of the break.
// ✗ Our ucd_* functions never break within grapheme clusters.
// LB30b: Do not break between an emoji base (or potential emoji) and an emoji modifier.
// ✗ Our ucd_* functions never break within grapheme clusters.
// LB31: Break everywhere else.
// ✗ Our default behavior.
// Join table for line breaking, laid out as [leading class][trailing class].
// * `1` means × (no line break is permitted between the two codepoints),
// * `X` means ÷ (a line break is permitted between them).
// Rows and columns follow the order given in the header comment below, which is
// presumably the declaration order of the `LineBreak` enum (TODO confirm when reordering).
#[rustfmt::skip]
pub const JOIN_RULES_LINE_BREAK: [[i32; 24]; 24] = [
/* ↓ leading → trailing codepoint */
/* | Other | WordJoiner | ZeroWidthSpace | Glue | Space | BreakAfter | BreakBefore | Hyphen | ClosePunctuation | CloseParenthesis_EA | CloseParenthesis_NotEA | Exclamation | Inseparable | Nonstarter | OpenPunctuation_EA | OpenPunctuation_NotEA | Quotation | InfixNumericSeparator | Numeric | PostfixNumeric | PrefixNumeric | SymbolsAllowingBreakAfter | Alphabetic | Ideographic | */
/* Other | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* WordJoiner | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* ZeroWidthSpace | */ [X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */, X /* | */],
/* Glue | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* Space | */ [X /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* BreakAfter | */ [X /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* BreakBefore | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* Hyphen | */ [X /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* ClosePunctuation | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */],
/* CloseParenthesis_EA | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */],
/* CloseParenthesis_NotEA | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */],
/* Exclamation | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* Inseparable | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* Nonstarter | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
/* OpenPunctuation_EA | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* OpenPunctuation_NotEA | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* Quotation | */ [1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* InfixNumericSeparator | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */],
/* Numeric | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */],
/* PostfixNumeric | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */],
/* PrefixNumeric | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */],
/* SymbolsAllowingBreakAfter | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */],
/* Alphabetic | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */],
/* Ideographic | */ [X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, 1 /* | */, X /* | */, X /* | */, 1 /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, 1 /* | */, X /* | */, X /* | */],
];