Skip to content

Commit

Permalink
case mimicking improvements
Browse files Browse the repository at this point in the history
2 separate literal string implementations - lazy and precomputed to avoid string clone
no longer limited to ascii
added examples bench
sprinkled must_use
  • Loading branch information
Fogapod committed Feb 26, 2024
1 parent b799e60 commit 9ca79ea
Show file tree
Hide file tree
Showing 14 changed files with 396 additions and 178 deletions.
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ required-features = ["cli"]
name = "accents"
harness = false

[[bench]]
name = "examples"
harness = false

[[bench]]
name = "literal_string"
harness = false
3 changes: 2 additions & 1 deletion benches/accents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ fn accents(c: &mut Criterion) {
let lines = read_sample_file_lines();

let mut g = c.benchmark_group("accents");
g.sampling_mode(criterion::SamplingMode::Linear);

for name in [
"original", "literal", "any", "weights", "upper", "lower", "concat",
Expand All @@ -32,7 +33,7 @@ fn accents(c: &mut Criterion) {
g.bench_function(name, |b| {
b.iter(|| {
for line in &lines {
accent.say_it(line, 0);
let _ = accent.say_it(line, 0);
}
})
});
Expand Down
55 changes: 55 additions & 0 deletions benches/examples.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
use criterion::{criterion_group, criterion_main, Criterion};
use sayit::Accent;
use std::{fs, path::PathBuf};

/// Loads the accent definition stored in the RON file at `filename`.
///
/// # Panics
///
/// Panics if the file cannot be read or if its content does not parse as an [`Accent`]. Both
/// messages include the offending path and the underlying error so that a broken example file
/// can be located straight from the benchmark output.
pub fn read_accent(filename: &PathBuf) -> Accent {
    let content = fs::read_to_string(filename).unwrap_or_else(|err| {
        panic!("reading accent definition {}: {err}", filename.display())
    });

    // Keep the parser error: it carries the position of the problem inside the file.
    ron::from_str::<Accent>(&content)
        .unwrap_or_else(|err| panic!("parsing accent {}: {err}", filename.display()))
}

/// Reads the whole benchmark sample text into a single string.
///
/// # Panics
///
/// Panics if `tests/sample_text.txt` cannot be read.
pub fn read_sample_file() -> String {
    let sample = fs::read_to_string("tests/sample_text.txt");

    sample.expect("reading sample text")
}

/// Splits the sample text into owned lines, dropping empty lines and lines equal to `" :"`.
pub fn read_sample_file_lines() -> Vec<String> {
    let text = read_sample_file();
    let mut kept = Vec::new();

    for line in text.lines() {
        if line.is_empty() || line == " :" {
            continue;
        }

        kept.push(line.to_owned());
    }

    kept
}

/// Benchmarks every `*.ron` accent found in the `examples` directory against the sample text.
///
/// One benchmark is registered per accent file, named after the file stem. Each iteration runs
/// the accent over every sample line at intensity 0.
///
/// # Panics
///
/// Panics if the `examples` directory or one of its entries cannot be read, or if an accent
/// file fails to load.
fn examples(c: &mut Criterion) {
    let lines = read_sample_file_lines();

    let mut g = c.benchmark_group("examples");
    g.sampling_mode(criterion::SamplingMode::Linear);

    for entry in fs::read_dir("examples").expect("reading examples directory") {
        let path = entry.expect("reading examples directory entry").path();

        // Only regular files with the `.ron` extension are accent definitions.
        let is_accent_file = path.is_file() && path.extension().is_some_and(|ext| ext == "ron");
        if !is_accent_file {
            continue;
        }

        let accent = read_accent(&path);
        let accent_name = path
            .file_stem()
            .expect("a path with an extension has a file stem")
            .to_string_lossy();

        g.bench_function(accent_name, |b| {
            b.iter(|| {
                for line in &lines {
                    // black_box keeps the optimizer from treating the input as a constant or
                    // discarding the unused result, which would make the measurement meaningless.
                    let _ = std::hint::black_box(
                        accent.say_it(std::hint::black_box(line.as_str()), 0),
                    );
                }
            })
        });
    }
    g.finish();
}

criterion_group!(benches, examples);
criterion_main!(benches);
35 changes: 8 additions & 27 deletions benches/literal_string.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
use criterion::criterion_group;
use criterion::criterion_main;
use criterion::Criterion;
use sayit::utils::LiteralString;
use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
use sayit::utils::{LiteralString, PrecomputedLiteral};
use std::fs;

pub fn read_sample_file() -> String {
Expand All @@ -19,43 +17,26 @@ fn read_sample_words() -> Vec<String> {
.collect()
}

// this is 100 times slower than _fast test
fn literal_string_slow(c: &mut Criterion) {
fn literal_string(c: &mut Criterion) {
let mut g = c.benchmark_group("literal_string");
g.sample_size(500);

g.bench_function("creation", |b| {
let words = read_sample_words();

b.iter(|| {
for word in &words {
let _ = LiteralString::from(word.as_str());
}
})
});
g.finish();
}

fn literal_string_fast(c: &mut Criterion) {
let mut g = c.benchmark_group("literal_string");
g.sample_size(300);
g.sampling_mode(SamplingMode::Linear);

g.bench_function("mimic_case", |b| {
let words = read_sample_words();
let strings: Vec<LiteralString> = words
let strings: Vec<PrecomputedLiteral> = words
.iter()
.map(|w| LiteralString::from(w.as_str()))
.map(|w| PrecomputedLiteral::new(w.to_string()))
.collect();
let reversed_words: Vec<String> = words.into_iter().rev().collect();

b.iter(|| {
for (string, word) in strings.iter().zip(&reversed_words) {
let _ = string.mimic_ascii_case(word);
let _ = string.mimic_case_action(word);
}
})
});
g.finish();
}

criterion_group!(benches, literal_string_slow, literal_string_fast);
criterion_group!(benches, literal_string);
criterion_main!(benches);
3 changes: 2 additions & 1 deletion src/accent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ impl Accent {
}

/// Walks rules for given intensity from top to bottom and applies them
#[must_use]
pub fn say_it<'a>(&self, text: &'a str, intensity: u64) -> Cow<'a, str> {
// Go from the end and pick first intensity that is less than or equal to requested. This is
// guaranteed to return something because base intensity 0 is always present at the bottom
Expand Down Expand Up @@ -101,7 +102,7 @@ mod tests {
"".to_owned(),
Pass::new(vec![
("(?-i)[a-z]".to_string(), Literal::new_boxed("e")),
("[A-Z]".to_string(), Literal::new_boxed("E")),
("(?-i)[A-Z]".to_string(), Literal::new_boxed("E")),
])
.unwrap(),
)],
Expand Down
19 changes: 17 additions & 2 deletions src/deserialize.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
use crate::{pass::Pass, tag::Tag, utils::runtime_format_single_value};
use crate::{
pass::Pass,
tag::Tag,
utils::{runtime_format_single_value, PrecomputedLiteral},
};
use std::{fmt, marker::PhantomData};

use serde::{
Expand Down Expand Up @@ -75,7 +79,7 @@ where
}

impl<'de> Deserialize<'de> for Any {
fn deserialize<D>(deserializer: D) -> Result<Any, D::Error>
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Expand All @@ -87,6 +91,17 @@ impl<'de> Deserialize<'de> for Any {
}
}

/// Deserializes a [`PrecomputedLiteral`] from a plain string by passing the decoded value to
/// [`PrecomputedLiteral::new`].
impl<'de> Deserialize<'de> for PrecomputedLiteral {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Any deserialization error is returned unchanged; construction only happens on success.
        String::deserialize(deserializer).map(Self::new)
    }
}

impl TryFrom<SortedMap<u64, Box<dyn Tag>, false>> for Weights {
type Error = WeightsError;

Expand Down
1 change: 1 addition & 0 deletions src/intensity.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ impl Intensity {
}

/// Runs all inner passes against text
#[must_use]
pub fn apply<'a>(&self, text: &'a str) -> Cow<'a, str> {
self.passes.iter().fold(Cow::Borrowed(text), |acc, pass| {
Cow::Owned(pass.apply(&acc).into_owned())
Expand Down
3 changes: 1 addition & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,7 @@ pub mod pass;
pub mod tag;
pub mod tag_impls;

// pub for bench
#[doc(hidden)]
#[doc(hidden)] // pub for bench
pub mod utils;

#[cfg(feature = "deserialize")]
Expand Down
47 changes: 41 additions & 6 deletions src/match.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,51 @@
use std::ops::Range;

use regex_automata::util::captures::Captures;

use crate::utils::LiteralString;
use crate::utils::{LazyLiteral, LiteralString};

/// Holds [`regex_automata::util::captures::Captures`] and full input
#[derive(Debug)]
pub struct Match<'a> {
pub captures: Captures,
pub input: &'a str,
pub(crate) captures: Captures,
pub(crate) input: &'a str,
}

impl<'a> Match<'a> {
/// Creates a match from already computed captures and the input they refer to
///
/// # Safety
///
/// Constructing with invalid Captures will cause UB in [`Match::get_range`] and
/// [`Match::get_match`]
///
/// Those methods skip their checks: `captures` must contain a match, and the range of that
/// match must be a valid `str` slice range of `input` (in bounds and on char boundaries).
pub unsafe fn new(captures: Captures, input: &'a str) -> Self {
Self { captures, input }
}

/// Returns full match range (regex group 0)
///
/// The range is the one used to slice the input in [`Match::get_match`].
#[inline]
pub fn get_range(&self) -> Range<usize> {
// SAFETY: Match is guaranteed to be created from valid Captures and input or via unsafe
// constructor
unsafe { self.captures.get_match().unwrap_unchecked() }.range()
}

/// Returns full match (regex group 0)
#[inline]
pub fn get_match(&self) -> &'a str {
&self.input[self.captures.get_match().expect("this matched").range()]
// SAFETY: Match is guaranteed to be created from valid Captures and input or via unsafe
// constructor
unsafe { self.input.get_unchecked(self.get_range()) }
}

/// Returns a reference to the captures this match holds
pub fn get_captures(&self) -> &Captures {
&self.captures
}

/// Returns the full input string this match holds, not just the matched part
pub fn get_input(&self) -> &'a str {
self.input
}

/// Uses regex interpolation syntax to use current match in template
#[must_use]
pub fn interpolate(&self, template: &str) -> String {
let mut dst = String::new();

Expand All @@ -26,7 +56,12 @@ impl<'a> Match<'a> {
}

/// Tries to match string case for current match
pub fn mimic_ascii_case(&self, template: &str) -> String {
LiteralString::from(template).mimic_ascii_case(self.get_match())
#[must_use]
pub fn mimic_case(&self, template: String) -> String {
let len = self.get_range().len();
let literal = LazyLiteral::new(template, len);
let action = literal.mimic_case_action(self.get_match());

literal.handle_mimic_action(action)
}
}
7 changes: 6 additions & 1 deletion src/pass.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ pub struct Pass {
}

// skips 20 pages of debug output of `multi_regex` field
#[allow(clippy::missing_fields_in_debug)]
impl fmt::Debug for Pass {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("Pass")
Expand Down Expand Up @@ -83,6 +84,7 @@ impl Pass {
}

/// Produces string with all non-overlapping regexes replaced by corresponding tags
#[must_use]
pub fn apply<'a>(&self, text: &'a str) -> Cow<'a, str> {
let all_captures: Vec<_> = self.multi_regex.captures_iter(text).collect();

Expand All @@ -94,7 +96,10 @@ impl Pass {
let mut output = String::with_capacity(text.len());

for caps in all_captures {
let caps_match = caps.get_match().expect("this matched");
// SAFETY: these captures come from matches. The only way this can fail is if they were
// created manually with Captures::empty()
let caps_match = unsafe { caps.get_match().unwrap_unchecked() };

let range = caps_match.range();
let tag = &self.tags[caps_match.pattern()];

Expand Down
Loading

0 comments on commit 9ca79ea

Please sign in to comment.