Some minor framebuffer optimizations & comments

2025-07-03 22:43:22 +00:00 · 2025-04-09 16:55:13 +02:00 · 2025-04-09 16:55:13 +02:00 · 2cdbf773aa
commit 2cdbf773aa
parent cee02c45b1
4 changed files with 83 additions and 34 deletions
--- a/benches/lib.rs
+++ b/benches/lib.rs
@ -1,7 +1,7 @@
-use criterion::{Criterion, Throughput, criterion_group, criterion_main};
+use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
 use edit::helpers::*;
-use edit::ucd::MeasurementConfig;
-use std::hint::black_box;
+use edit::memchr;
+use edit::ucd;

 fn bench(c: &mut Criterion) {
    let reference = concat!(
@ -13,21 +13,31 @@ fn bench(c: &mut Criterion) {
    let buffer = reference.repeat(10);
    let bytes = buffer.as_bytes();

-    let mut group = c.benchmark_group("ucd");
+    let mut group = c.benchmark_group("ucd::MeasurementConfig::goto_logical");
    group.throughput(Throughput::Bytes(bytes.len() as u64));
-    group.bench_function("MeasurementConfig::goto_logical", |b| {
-        b.iter(|| black_box(MeasurementConfig::new(&bytes).goto_logical(Point::MAX)))
+    group.bench_function("basic", |b| {
+        b.iter(|| ucd::MeasurementConfig::new(&bytes).goto_logical(Point::MAX))
    });
-    group.bench_function("MeasurementConfig::goto_logical with word wrap", |b| {
+    group.bench_function("word_wrap", |b| {
        b.iter(|| {
-            black_box(
-                MeasurementConfig::new(&bytes)
+            ucd::MeasurementConfig::new(&bytes)
                .with_word_wrap_column(50)
-                    .goto_logical(Point::MAX),
-            )
+                .goto_logical(Point::MAX)
        })
    });
    group.finish();
+
+    let mut group = c.benchmark_group("memchr::memchr2");
+    let mut buffer = [0u8; 8192];
+    for &size in &[0usize, 8, 64, 4096] {
+        group.throughput(Throughput::Bytes(size as u64 + 1));
+        group.bench_with_input(BenchmarkId::from_parameter(size), &size, |b, &size| {
+            buffer.fill(b'a');
+            buffer[size] = b'\n';
+            b.iter(|| memchr::memchr2(b'\n', b'\r', &buffer[..size], 0));
+        });
+    }
+    group.finish();
 }

 criterion_group!(benches, bench);
--- a/src/framebuffer.rs
+++ b/src/framebuffer.rs
@ -632,34 +632,67 @@ impl Bitmap {
    }

    fn alpha_blend(dst: u32, src: u32) -> u32 {
-        let src_r = Self::srgb_to_linear(src & 0xff);
-        let src_g = Self::srgb_to_linear((src >> 8) & 0xff);
-        let src_b = Self::srgb_to_linear((src >> 16) & 0xff);
-        let src_a = (src >> 24) as f32 / 255.0f32;
+        let src_a = (src >> 24) as f32 * (1.0 / 255.0);
+        let src_b = Self::srgb_to_linear(src >> 16);
+        let src_g = Self::srgb_to_linear(src >> 8);
+        let src_r = Self::srgb_to_linear(src);

-        let dst_r = Self::srgb_to_linear(dst & 0xff);
-        let dst_g = Self::srgb_to_linear((dst >> 8) & 0xff);
-        let dst_b = Self::srgb_to_linear((dst >> 16) & 0xff);
-        let dst_a = (dst >> 24) as f32 / 255.0f32;
+        let dst_a = (dst >> 24) as f32 * (1.0 / 255.0);
+        let dst_b = Self::srgb_to_linear(dst >> 16);
+        let dst_g = Self::srgb_to_linear(dst >> 8);
+        let dst_r = Self::srgb_to_linear(dst);

-        let out_a = src_a + dst_a * (1.0f32 - src_a);
-        let out_r = (src_r * src_a + dst_r * dst_a * (1.0f32 - src_a)) / out_a;
-        let out_g = (src_g * src_a + dst_g * dst_a * (1.0f32 - src_a)) / out_a;
-        let out_b = (src_b * src_a + dst_b * dst_a * (1.0f32 - src_a)) / out_a;
+        let out_a = src_a + dst_a * (1.0 - src_a);
+        // The formula is technically:
+        //   (src_bgr * src_a + dst_bgr * dst_a * (1 - src_a)) / out_a
+        // but we can fold the division by out_a into the two weights (src_mul and dst_mul).
+        // This saves us a bunch of operations that the compiler cannot optimize away otherwise.
+        let out_a_inv = 1.0 / out_a;
+        let src_mul = src_a * out_a_inv;
+        let dst_mul = dst_a * (1.0 - src_a) * out_a_inv;
+        let out_b = src_b * src_mul + dst_b * dst_mul;
+        let out_g = src_g * src_mul + dst_g * dst_mul;
+        let out_r = src_r * src_mul + dst_r * dst_mul;

-        (((out_a * 255.0f32) as u32) << 24)
-            | (Self::linear_to_srgb(out_b) << 16)
-            | (Self::linear_to_srgb(out_g) << 8)
-            | Self::linear_to_srgb(out_r)
+        let out_b = Self::linear_to_srgb(out_b);
+        let out_g = Self::linear_to_srgb(out_g);
+        let out_r = Self::linear_to_srgb(out_r);
+
+        (((out_a * 255.0f32) as u32) << 24) | (out_b << 16) | (out_g << 8) | out_r
    }

    fn srgb_to_linear(c: u32) -> f32 {
-        let fc = c as f32 / 255.0f32;
-        if fc <= 0.04045f32 {
-            fc / 12.92f32
-        } else {
-            ((fc + 0.055f32) / 1.055f32).powf(2.4f32)
-        }
+        // Generated using:
+        // ```rs
+        // let fc = c as f32 / 255.0;
+        // if fc <= 0.04045 {
+        //     fc / 12.92
+        // } else {
+        //     ((fc + 0.055) / 1.055).powf(2.4)
+        // }
+        // ```
+        // I'd love to use hex floats, but for some reason Rust maintainers decided against it...
+        #[rustfmt::skip]
+        #[allow(clippy::excessive_precision)]
+        const LUT: [f32; 256] = [
+            0.0000000000, 0.0003035270, 0.0006070540, 0.0009105810, 0.0012141080, 0.0015176350, 0.0018211619, 0.0021246888, 0.0024282159, 0.0027317430, 0.0030352699, 0.0033465356, 0.0036765069, 0.0040247170, 0.0043914421, 0.0047769533,
+            0.0051815170, 0.0056053917, 0.0060488326, 0.0065120910, 0.0069954102, 0.0074990317, 0.0080231922, 0.0085681248, 0.0091340570, 0.0097212177, 0.0103298230, 0.0109600937, 0.0116122449, 0.0122864870, 0.0129830306, 0.0137020806,
+            0.0144438436, 0.0152085144, 0.0159962922, 0.0168073755, 0.0176419523, 0.0185002182, 0.0193823613, 0.0202885624, 0.0212190095, 0.0221738834, 0.0231533647, 0.0241576303, 0.0251868572, 0.0262412224, 0.0273208916, 0.0284260381,
+            0.0295568332, 0.0307134409, 0.0318960287, 0.0331047624, 0.0343398079, 0.0356013142, 0.0368894450, 0.0382043645, 0.0395462364, 0.0409151986, 0.0423114114, 0.0437350273, 0.0451862030, 0.0466650836, 0.0481718220, 0.0497065634,
+            0.0512694679, 0.0528606549, 0.0544802807, 0.0561284944, 0.0578054339, 0.0595112406, 0.0612460710, 0.0630100295, 0.0648032799, 0.0666259527, 0.0684781820, 0.0703601092, 0.0722718611, 0.0742135793, 0.0761853904, 0.0781874284,
+            0.0802198276, 0.0822827145, 0.0843762159, 0.0865004659, 0.0886556059, 0.0908417329, 0.0930589810, 0.0953074843, 0.0975873619, 0.0998987406, 0.1022417471, 0.1046164930, 0.1070231125, 0.1094617173, 0.1119324341, 0.1144353822,
+            0.1169706732, 0.1195384338, 0.1221387982, 0.1247718409, 0.1274376959, 0.1301364899, 0.1328683347, 0.1356333494, 0.1384316236, 0.1412633061, 0.1441284865, 0.1470272839, 0.1499598026, 0.1529261619, 0.1559264660, 0.1589608639,
+            0.1620294005, 0.1651322246, 0.1682693958, 0.1714410931, 0.1746473908, 0.1778884083, 0.1811642349, 0.1844749898, 0.1878207624, 0.1912016720, 0.1946178079, 0.1980693042, 0.2015562356, 0.2050787061, 0.2086368501, 0.2122307271,
+            0.2158605307, 0.2195262313, 0.2232279778, 0.2269658893, 0.2307400703, 0.2345506549, 0.2383976579, 0.2422811985, 0.2462013960, 0.2501583695, 0.2541521788, 0.2581829131, 0.2622507215, 0.2663556635, 0.2704978585, 0.2746773660,
+            0.2788943350, 0.2831487954, 0.2874408960, 0.2917706966, 0.2961383164, 0.3005438447, 0.3049873710, 0.3094689548, 0.3139887452, 0.3185468316, 0.3231432438, 0.3277781308, 0.3324515820, 0.3371636569, 0.3419144452, 0.3467040956,
+            0.3515326977, 0.3564002514, 0.3613068759, 0.3662526906, 0.3712377846, 0.3762622178, 0.3813261092, 0.3864295185, 0.3915725648, 0.3967553079, 0.4019778669, 0.4072403014, 0.4125427008, 0.4178851545, 0.4232677519, 0.4286905527,
+            0.4341537058, 0.4396572411, 0.4452012479, 0.4507858455, 0.4564110637, 0.4620770514, 0.4677838385, 0.4735315442, 0.4793202281, 0.4851499796, 0.4910208881, 0.4969330430, 0.5028865933, 0.5088814497, 0.5149177909, 0.5209956765,
+            0.5271152258, 0.5332764983, 0.5394796133, 0.5457245708, 0.5520114899, 0.5583404899, 0.5647116303, 0.5711249113, 0.5775805116, 0.5840784907, 0.5906189084, 0.5972018838, 0.6038274169, 0.6104956269, 0.6172066331, 0.6239604354,
+            0.6307572126, 0.6375969648, 0.6444797516, 0.6514056921, 0.6583748460, 0.6653873324, 0.6724432111, 0.6795425415, 0.6866854429, 0.6938719153, 0.7011020184, 0.7083759308, 0.7156936526, 0.7230552435, 0.7304608822, 0.7379105687,
+            0.7454043627, 0.7529423237, 0.7605246305, 0.7681512833, 0.7758223414, 0.7835379243, 0.7912980318, 0.7991028428, 0.8069523573, 0.8148466945, 0.8227858543, 0.8307699561, 0.8387991190, 0.8468732834, 0.8549926877, 0.8631572723,
+            0.8713672161, 0.8796223402, 0.8879231811, 0.8962693810, 0.9046613574, 0.9130986929, 0.9215820432, 0.9301108718, 0.9386858940, 0.9473065734, 0.9559735060, 0.9646862745, 0.9734454751, 0.9822505713, 0.9911022186, 1.0000000000,
+        ];
+        LUT[(c & 0xff) as usize]
    }

    fn linear_to_srgb(c: f32) -> u32 {
--- a/src/ucd.rs
+++ b/src/ucd.rs
@ -483,6 +483,7 @@ impl<'doc> MeasurementConfig<'doc> {
        }
    }

+    #[inline]
    fn calc_target_x(target: Point, pos_y: CoordType) -> CoordType {
        match pos_y.cmp(&target.y) {
            std::cmp::Ordering::Less => CoordType::MAX,
--- a/src/utf8.rs
+++ b/src/utf8.rs
@ -35,6 +35,8 @@ impl<'a> Utf8Chars<'a> {
        self.offset < self.source.len()
    }

+    // I found that on mixed 50/50 English/Non-English text,
+    // performance actually suffers when this gets inlined.
    #[cold]
    fn next_slow(&mut self, c: u8) -> char {
        // See: https://datatracker.ietf.org/doc/html/rfc3629
@ -197,6 +199,9 @@ impl<'a> Utf8Chars<'a> {
 impl Iterator for Utf8Chars<'_> {
    type Item = char;

+    // Without this hint the function doesn't get inlined at opt-level="s",
+    // and performance suffers greatly in that case.
+    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.offset >= self.source.len() {
            return None;