case mimicking improvements

2 separate literal string implementations - lazy and precomputed - to avoid string clone; no longer limited to ASCII; added examples bench; sprinkled must_use
Fogapod · Feb 26, 2024 · 9ca79ea · 9ca79ea
1 parent b799e60
commit 9ca79ea
Show file tree

Hide file tree

Showing 14 changed files with 396 additions and 178 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -69,6 +69,10 @@ required-features = ["cli"]
 name = "accents"
 harness = false
 
+[[bench]]
+name = "examples"
+harness = false
+
 [[bench]]
 name = "literal_string"
 harness = false
diff --git a/benches/accents.rs b/benches/accents.rs
@@ -23,6 +23,7 @@ fn accents(c: &mut Criterion) {
     let lines = read_sample_file_lines();
 
     let mut g = c.benchmark_group("accents");
+    g.sampling_mode(criterion::SamplingMode::Linear);
 
     for name in [
         "original", "literal", "any", "weights", "upper", "lower", "concat",
@@ -32,7 +33,7 @@ fn accents(c: &mut Criterion) {
         g.bench_function(name, |b| {
             b.iter(|| {
                 for line in &lines {
-                    accent.say_it(line, 0);
+                    let _ = accent.say_it(line, 0);
                 }
             })
         });

diff --git a/benches/examples.rs b/benches/examples.rs
@@ -0,0 +1,55 @@
+use criterion::{criterion_group, criterion_main, Criterion};
+use sayit::Accent;
+use std::{fs, path::PathBuf};
+
+pub fn read_accent(filename: &PathBuf) -> Accent {
+    let content = fs::read_to_string(filename).expect("reading accent definition");
+    ron::from_str::<Accent>(&content)
+        .unwrap_or_else(|_| panic!("parsing accent {}", filename.display()))
+}
+
+pub fn read_sample_file() -> String {
+    fs::read_to_string("tests/sample_text.txt").expect("reading sample text")
+}
+
+pub fn read_sample_file_lines() -> Vec<String> {
+    read_sample_file()
+        .lines()
+        .filter(|&l| !(l.is_empty() || l.eq(" :")))
+        .map(|s| s.to_owned())
+        .collect()
+}
+
+fn examples(c: &mut Criterion) {
+    let lines = read_sample_file_lines();
+
+    let mut g = c.benchmark_group("examples");
+    g.sampling_mode(criterion::SamplingMode::Linear);
+
+    for entry in fs::read_dir("examples").unwrap() {
+        let path = entry.unwrap().path();
+
+        if !path.is_file() {
+            continue;
+        }
+
+        if !path.extension().is_some_and(|ext| ext == "ron") {
+            continue;
+        }
+
+        let accent = read_accent(&path);
+        let accent_name = path.file_stem().unwrap().to_string_lossy();
+
+        g.bench_function(accent_name, |b| {
+            b.iter(|| {
+                for line in &lines {
+                    let _ = accent.say_it(line, 0);
+                }
+            })
+        });
+    }
+    g.finish();
+}
+
+criterion_group!(benches, examples);
+criterion_main!(benches);
diff --git a/benches/literal_string.rs b/benches/literal_string.rs
@@ -1,7 +1,5 @@
-use criterion::criterion_group;
-use criterion::criterion_main;
-use criterion::Criterion;
-use sayit::utils::LiteralString;
+use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
+use sayit::utils::{LiteralString, PrecomputedLiteral};
 use std::fs;
 
 pub fn read_sample_file() -> String {
@@ -19,43 +17,26 @@ fn read_sample_words() -> Vec<String> {
         .collect()
 }
 
-// this is 100 times slower than _fast test
-fn literal_string_slow(c: &mut Criterion) {
+fn literal_string(c: &mut Criterion) {
     let mut g = c.benchmark_group("literal_string");
-    g.sample_size(500);
-
-    g.bench_function("creation", |b| {
-        let words = read_sample_words();
-
-        b.iter(|| {
-            for word in &words {
-                let _ = LiteralString::from(word.as_str());
-            }
-        })
-    });
-    g.finish();
-}
-
-fn literal_string_fast(c: &mut Criterion) {
-    let mut g = c.benchmark_group("literal_string");
-    g.sample_size(300);
+    g.sampling_mode(SamplingMode::Linear);
 
     g.bench_function("mimic_case", |b| {
         let words = read_sample_words();
-        let strings: Vec<LiteralString> = words
+        let strings: Vec<PrecomputedLiteral> = words
             .iter()
-            .map(|w| LiteralString::from(w.as_str()))
+            .map(|w| PrecomputedLiteral::new(w.to_string()))
             .collect();
         let reversed_words: Vec<String> = words.into_iter().rev().collect();
 
         b.iter(|| {
             for (string, word) in strings.iter().zip(&reversed_words) {
-                let _ = string.mimic_ascii_case(word);
+                let _ = string.mimic_case_action(word);
             }
         })
     });
     g.finish();
 }
 
-criterion_group!(benches, literal_string_slow, literal_string_fast);
+criterion_group!(benches, literal_string);
 criterion_main!(benches);
diff --git a/src/accent.rs b/src/accent.rs
@@ -42,6 +42,7 @@ impl Accent {
     }
 
     /// Walks rules for given intensity from top to bottom and applies them
+    #[must_use]
     pub fn say_it<'a>(&self, text: &'a str, intensity: u64) -> Cow<'a, str> {
         // Go from the end and pick first intensity that is less or equal to requested. This is
         // guaranteed to return something because base intensity 0 is always present at the bottom
@@ -101,7 +102,7 @@ mod tests {
                 "".to_owned(),
                 Pass::new(vec![
                     ("(?-i)[a-z]".to_string(), Literal::new_boxed("e")),
-                    ("[A-Z]".to_string(), Literal::new_boxed("E")),
+                    ("(?-i)[A-Z]".to_string(), Literal::new_boxed("E")),
                 ])
                 .unwrap(),
             )],

diff --git a/src/deserialize.rs b/src/deserialize.rs
@@ -1,4 +1,8 @@
-use crate::{pass::Pass, tag::Tag, utils::runtime_format_single_value};
+use crate::{
+    pass::Pass,
+    tag::Tag,
+    utils::{runtime_format_single_value, PrecomputedLiteral},
+};
 use std::{fmt, marker::PhantomData};
 
 use serde::{
@@ -75,7 +79,7 @@ where
 }
 
 impl<'de> Deserialize<'de> for Any {
-    fn deserialize<D>(deserializer: D) -> Result<Any, D::Error>
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: Deserializer<'de>,
     {
@@ -87,6 +91,17 @@ impl<'de> Deserialize<'de> for Any {
     }
 }
 
+impl<'de> Deserialize<'de> for PrecomputedLiteral {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: Deserializer<'de>,
+    {
+        let s = String::deserialize(deserializer)?;
+
+        Ok(Self::new(s))
+    }
+}
+
 impl TryFrom<SortedMap<u64, Box<dyn Tag>, false>> for Weights {
     type Error = WeightsError;
 

diff --git a/src/intensity.rs b/src/intensity.rs
@@ -62,6 +62,7 @@ impl Intensity {
     }
 
     /// Runs all inner passes against text
+    #[must_use]
     pub fn apply<'a>(&self, text: &'a str) -> Cow<'a, str> {
         self.passes.iter().fold(Cow::Borrowed(text), |acc, pass| {
             Cow::Owned(pass.apply(&acc).into_owned())

diff --git a/src/lib.rs b/src/lib.rs
@@ -131,8 +131,7 @@ pub mod pass;
 pub mod tag;
 pub mod tag_impls;
 
-// pub for bench
-#[doc(hidden)]
+#[doc(hidden)] // pub for bench
 pub mod utils;
 
 #[cfg(feature = "deserialize")]

diff --git a/src/match.rs b/src/match.rs
@@ -1,21 +1,51 @@
+use std::ops::Range;
+
 use regex_automata::util::captures::Captures;
 
-use crate::utils::LiteralString;
+use crate::utils::{LazyLiteral, LiteralString};
 
 /// Holds [`regex_automata::util::captures::Captures`] and full input
 #[derive(Debug)]
 pub struct Match<'a> {
-    pub captures: Captures,
-    pub input: &'a str,
+    pub(crate) captures: Captures,
+    pub(crate) input: &'a str,
 }
 
 impl<'a> Match<'a> {
+    /// # Safety
+    ///
+    /// Constructing with invalid Captures will cause UB in [`Match::get_range`] and
+    /// [`Match::get_match`]
+    pub unsafe fn new(captures: Captures, input: &'a str) -> Self {
+        Self { captures, input }
+    }
+
+    /// Returns full match range (regex group 0)
+    #[inline]
+    pub fn get_range(&self) -> Range<usize> {
+        // SAFETY: Match is guaranteed to be created from valid Captures and input or via unsafe
+        //         constructor
+        unsafe { self.captures.get_match().unwrap_unchecked() }.range()
+    }
+
     /// Returns full match (regex group 0)
+    #[inline]
     pub fn get_match(&self) -> &'a str {
-        &self.input[self.captures.get_match().expect("this matched").range()]
+        // SAFETY: Match is guaranteed to be created from valid Captures and input or via unsafe
+        //         constructor
+        unsafe { self.input.get_unchecked(self.get_range()) }
+    }
+
+    pub fn get_captures(&self) -> &Captures {
+        &self.captures
+    }
+
+    pub fn get_input(&self) -> &'a str {
+        self.input
     }
 
     /// Uses regex interpolation syntax to use current match in template
+    #[must_use]
     pub fn interpolate(&self, template: &str) -> String {
         let mut dst = String::new();
 
@@ -26,7 +56,12 @@ impl<'a> Match<'a> {
     }
 
     /// Tries to match string case for current match
-    pub fn mimic_ascii_case(&self, template: &str) -> String {
-        LiteralString::from(template).mimic_ascii_case(self.get_match())
+    #[must_use]
+    pub fn mimic_case(&self, template: String) -> String {
+        let len = self.get_range().len();
+        let literal = LazyLiteral::new(template, len);
+        let action = literal.mimic_case_action(self.get_match());
+
+        literal.handle_mimic_action(action)
     }
 }
diff --git a/src/pass.rs b/src/pass.rs
@@ -16,6 +16,7 @@ pub struct Pass {
 }
 
 // skips 20 pages of debug output of `multi_regex` field
+#[allow(clippy::missing_fields_in_debug)]
 impl fmt::Debug for Pass {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         f.debug_struct("Pass")
@@ -83,6 +84,7 @@ impl Pass {
     }
 
     /// Produces string with all non-overlapping regexes replaced by corresponding tags
+    #[must_use]
     pub fn apply<'a>(&self, text: &'a str) -> Cow<'a, str> {
         let all_captures: Vec<_> = self.multi_regex.captures_iter(text).collect();
 
@@ -94,7 +96,10 @@ impl Pass {
         let mut output = String::with_capacity(text.len());
 
         for caps in all_captures {
-            let caps_match = caps.get_match().expect("this matched");
+            // SAFETY: these captures come from matches. The only way this can fail is if they were
+            //         created manually with Captures::empty()
+            let caps_match = unsafe { caps.get_match().unwrap_unchecked() };
+
             let range = caps_match.range();
             let tag = &self.tags[caps_match.pattern()];