Skip to content

Commit

Permalink
Change inner regex library from "regex" to "fancy_regex" to support `…
Browse files Browse the repository at this point in the history
…--locator` option's value using syntax like lookaround.
  • Loading branch information
boholder committed Dec 28, 2021
1 parent 8502c4a commit 3d0b464
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 120 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "naming_clt"
version = "1.0.0"
version = "1.1.0"
authors = ["Boholder <[email protected]>"]
description = """
Extract and convert the naming format(case|notation) of identifiers from files or stdin.
Expand All @@ -27,7 +27,7 @@ members = [
]

[dependencies]
naming_clt_lib = { version = "0.1.0", path = "crates/naming_clt_lib" }
naming_clt_lib = { version = "0.2.0", path = "crates/naming_clt_lib" }
atty = "~0.2.14"

[dependencies.clap]
Expand Down
55 changes: 11 additions & 44 deletions crates/core/app.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ fn args<'a, 'b>() -> Box<[Arg<'a, 'b>]> {
.help(
"Set locator pairs around identifiers, \
the separator between prefix and suffix is a space; \
default(1): \"\\s \\s\"",
default(1): \"(?<=\\b)<word>(?=\\b)\"",
)
.takes_value(true)
.multiple(true)
Expand Down Expand Up @@ -139,54 +139,21 @@ to the line contains given eof-word, to this tool.

const LOCATOR_HELP: &str = "\
Set locator pairs around identifiers, in each pair value,
the delimiter between prefix and suffix is a space.
[!] Although this tool provides this option for convenience,
be careful when using it. See warning information below for detail.
the delimiter between prefix and suffix is a pair of brackets \"{}\".
The two parts of a pair are inserted directly into the
regular pattern, so users need to manually escape the characters
that need to be escaped. For [the regex syntax], check this document:
https://docs.rs/regex/1.5.4/regex/index.html#syntax
Default(1): \"\\b \\b\", which will match the valid identifiers
separated by \"Unicode word boundary (\\w on one side and \\W,
\\A, or \\z on other)\" positions.
Each value passed to this option will be transformed to an regex
pattern looks like: \"(?:<prefix>|\\A)(identifier)(?:<suffix>|\\z)\",
where \\A for matching the start of file position and
\\z for matching the end of file position.
So there is no need to worry about having a match right next to
the start or end of the file. If the input is only one valid word,
that word will also be matched because of the hardcoded logic
described above.
[!]WARNING:
Sorry for letting you know these implementation details.
The rust regex library \"regex\" will matches text with a
\"non-overlapping\" way, while it doesn't support lookaround syntax.
So for the implementation, I used non-capture tuples to
match prefixes and suffixes, which have mentioned above.
An unintentional locator value, for example \"\\s \\s\",
will generates a pattern \"(?:\\s|\\A)([a-zA-Z0-9_-]+)(?:\\s|\\z)\".
This pattern will only matches \"a\" and \"c\" on text \"a b c\",
while \"b\" shares two space symbols with \"a\" and \"c\".
First matches \"<start of file>a \", remain \"b c<eof>\",
then matches \" c<eof>\", further extracting via group number
gets \"a\" and \"c\" as final result.
As you can see, locators that represent characters rather than
positions may let to this unwanted result. Please use this option
carefully and make sure the output of this tool is as expected,
or use the grep method mentioned below as an alternative to
extracting words from input text.
[!]NOTE: Any incomplete pair value that couldn't be split
into two part by space delimiter like \"a\",\" a\",\"a \"
https://docs.rs/fancy-regex/0.7.1/fancy_regex/index.html#syntax
Default(1): \"(?<=\\b){}(?=\\b)\", which will match the valid identifiers
separated by \"Unicode word boundary (\\w on one side and \\W, \\A, or \\z
on the other side)\" positions.
NOTE:
Any incomplete pair value that couldn't be split
into two part by space delimiter like \"a\",\"{}a\",\"a{}\"
will let the tool outputs nothing and exits with non-zero signal.
NOTE:
Expand Down
4 changes: 2 additions & 2 deletions crates/naming_clt_lib/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "naming_clt_lib"
version = "0.1.0"
version = "0.2.0"
authors = ["Boholder <[email protected]>"]
description = """
Functions which serve the command line tool [naming].
Expand All @@ -10,6 +10,6 @@ edition = "2018"
license = "MIT"

[dependencies]
regex = "~1.5.4"
fancy-regex = "~0.7.1"
lazy_static = "~1.4.0"
naming_lib = "~0.1.4"
45 changes: 22 additions & 23 deletions crates/naming_clt_lib/src/extraction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,29 +2,29 @@ use std::collections::HashSet;
use std::fs;
use std::io::{self, BufRead};

use regex::{self, Regex};
use fancy_regex::Regex;

/// Convert Vec<String> into a slice of &str in Rust:
/// https://stackoverflow.com/a/41180422/11397457
pub fn read_from_files<T: AsRef<str>>(
files: &[T],
eof: Option<&str>,
logical_eof: Option<&str>,
) -> Result<Vec<String>, String> {
let mut text = Vec::new();
for file in files {
text.push(read_file(file.as_ref(), eof)?);
text.push(read_file(file.as_ref(), logical_eof)?);
}
Ok(text)
}

fn read_file(file: &str, eof: Option<&str>) -> Result<String, String> {
match fs::read_to_string(file) {
Ok(text) => Ok(apply_eof_on_text(eof, text)),
Ok(text) => Ok(trim_text_with_logical_eof(eof, text)),
Err(msg) => Err(format!("naming: {}: {}", file, msg)),
}
}

fn apply_eof_on_text(eof: Option<&str>, text: String) -> String {
fn trim_text_with_logical_eof(eof: Option<&str>, text: String) -> String {
match eof {
None => text,
Some(eof) => {
Expand All @@ -45,7 +45,7 @@ where
{
let mut buffer = String::new();
match input.read_to_string(&mut buffer) {
Ok(_) => Ok(apply_eof_on_text(eof, buffer)),
Ok(_) => Ok(trim_text_with_logical_eof(eof, buffer)),
Err(msg) => Err(format!("naming: stdin: {}", msg)),
}
}
Expand All @@ -60,7 +60,10 @@ impl Captor {
/// Options should be manually escaped by user.
/// If there is a locator pair which couldn't be converted to regex, return an Err.
pub fn new(locators: Option<Vec<String>>) -> Result<Captor, String> {
let locators = locators.unwrap_or_else(|| vec![r"\b \b".to_string()]);
// TODO: explain in the manual that the double brackets ("{}") stand for the word.
// Set default locator as word edge '\b'.
let locators =
locators.unwrap_or_else(|| vec![r"(?<=\b){}(?=\b)".to_string()]);
let mut patterns = Vec::new();
for locator in locators {
patterns.push(Captor::build_pattern_from(locator)?);
Expand All @@ -69,21 +72,19 @@ impl Captor {
}

fn build_pattern_from(locator: String) -> Result<Regex, String> {
let pair = locator.split_once(" ");
let pair = locator.split_once("{}");
if pair.is_none() {
return Err(format!(
"naming: locator `{}`: can't build locator pair from this.",
"naming: locator `{}`: can't split locator pair from this.",
locator
));
}
let pair = pair.unwrap();

Ok(Regex::new(
// \A for start of file position,
// \z for end of file position
&format!(r"(?:{}|\A)([a-zA-Z0-9_-]+)(?:{}|\z)", pair.0, pair.1),
)
.unwrap())
// Currently, we use `([a-zA-Z0-9_-]+)` to match words, it's ok for now,
// because it could match words with any naming format.
Ok(Regex::new(&format!(r"{}([a-zA-Z0-9_-]+){}", pair.0, pair.1))
.unwrap())
}

/// Extract words from given long text string,
Expand All @@ -96,7 +97,6 @@ impl Captor {
// https://users.rust-lang.org/t/deduplicate-vector-in-place-while-preserving-order/56568/6
let mut set = HashSet::new();
matches.retain(|word| set.insert(word.clone()));

matches
}

Expand All @@ -107,10 +107,9 @@ impl Captor {
self.patterns
.iter()
.map(move |pattern| {
pattern
.captures_iter(t)
.into_iter()
.map(|cap| cap[1].to_string())
pattern.captures_iter(t).into_iter().map(|cap| {
cap.unwrap().get(1).unwrap().as_str().to_string()
})
})
.flatten()
// now get one file's matches
Expand Down Expand Up @@ -164,7 +163,7 @@ mod captor_tests {
let text =
to_string_vec(vec!["int i = 1; String s = oneMethod(arg1, arg2);"]);
let actual =
Captor::new(Some(to_string_vec(vec![r"\s \s*=", r"\s \s*;"])))
Captor::new(Some(to_string_vec(vec![r"\s{}\s*=", r"\s{}\s*;"])))
.unwrap()
.capture_words(text);
let expect: Vec<String> = to_string_vec(vec!["i", "s", "1"]);
Expand All @@ -176,7 +175,7 @@ mod captor_tests {
let text = to_string_vec(vec!["@now#can$be&matched"]);
// note that "$" is manually escaped.
let locators: Vec<String> =
to_string_vec(vec![r"# \$", "@ #", r"\$ &", r"& \z"]);
to_string_vec(vec![r"#{}\$", "@{}#", r"\${}&", r"&{}\z"]);

let actual = Captor::new(Some(locators)).unwrap().capture_words(text);
// notice that the result order is based on option order.
Expand All @@ -192,7 +191,7 @@ mod captor_tests {
"let a = 1; let b = 2; let c = 3;",
]);
let actual =
Captor::new(Some(to_string_vec(vec![r"\s \s*=", r"\s \s*;"])))
Captor::new(Some(to_string_vec(vec![r"\s{}\s*=", r"\s{}\s*;"])))
.unwrap()
.capture_words(text);
// notice that the result order is based on option order.
Expand Down
44 changes: 5 additions & 39 deletions doc/USERGUIDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -111,53 +111,20 @@ OPTIONS:
-l, --locator <locator>...
Set locator pairs around identifiers, in each pair value,
the delimiter between prefix and suffix is a space.
[!] Although this tool provides this option for convenience,
be careful when using it. See warning information below for detail.
the delimiter between prefix and suffix is a pair of brackets "{}".
The two parts of a pair are inserted directly into the
regular pattern, so users need to manually escape the characters
that need to be escaped. For [the regex syntax], check this document:
https://docs.rs/regex/1.5.4/regex/index.html#syntax
https://docs.rs/fancy-regex/0.7.1/fancy_regex/index.html#syntax
Default(1): "\b \b", which will match the valid identifiers
Default(1): "(?<=\b){}(?=\b)", which will match the valid identifiers
separated by "Unicode word boundary (\w on one side and \W,
\A, or \z on other)" positions.
Each value passed to this option will be transformed to an regex
pattern looks like: "(?:<prefix>|\A)(identifier)(?:<suffix>|\z)",
where \A for matching the start of file position and
\z for matching the end of file position.
So there is no need to worry about having a match right next to
the start or end of the file. If the input is only one valid word,
that word will also be matched because of the hardcoded logic
described above.
[!]WARNING:
Sorry for letting you know these implementation details.
The rust regex library "regex" will matches text with a
"non-overlapping" way, while it doesn't support lookaround syntax.
So for the implementation, I used non-capture tuples to
match prefixes and suffixes, which have mentioned above.
An unintentional locator value, for example "\s \s",
will generates a pattern "(?:\s|\A)([a-zA-Z0-9_-]+)(?:\s|\z)".
This pattern will only matches "a" and "c" on text "a b c",
while "b" shares two space symbols with "a" and "c".
First matches "<start of file>a ", remain "b c<eof>",
then matches " c<eof>", further extracting via group number
gets "a" and "c" as final result.
As you can see, locators that represent characters rather than
positions may let to this unwanted result. Please use this option
carefully and make sure the output of this tool is as expected,
or use the grep method mentioned below as an alternative to
extracting words from input text.
[!]NOTE: Any incomplete pair value that couldn't be split
NOTE:
Any incomplete pair value that couldn't be split
into two parts by the "{}" delimiter, like "a","{}a","a{}",
will make the tool output nothing and exit with a non-zero status.
Expand All @@ -181,7 +148,6 @@ OPTIONS:
Commands above piped "first\nsecond" to this tool.
-o, --output <output>...
Set which naming cases that matches will be converted to.
Expand Down
14 changes: 4 additions & 10 deletions tests/flags_and_options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,27 +42,21 @@ SCREAMING_SNAKE SCREAMING_SNAKE screaming_snake screaming-snake screamingSnake S
#[test]
fn locator() {
let mut cmd = Command::cargo_bin("naming").unwrap();
cmd.arg(r"--locator=\scam Case\s,ke -case")
cmd.arg(r"--locator=\scam{}Case\s,ke{}-case")
.arg("tests/data/all.txt")
.assert()
.success()
.stdout("el EL el el el El\nbab BAB bab bab bab Bab");
}

#[test]
fn locator_invalid_when_single_valid_word_appears_in_special_position() {
fn locator_support_lookaround() {
let mut cmd = Command::cargo_bin("naming").unwrap();
// Note that in one.txt, word "userId " has a space after it.
// If remove that space, the hardcoded pair "\A(word)\z"
// will match the "userId" because:
// 1. it's a valid word
// 2. it's head position is "start of file" (\A),
// and tail position is "end of file" (\z)
cmd.arg(r#"--locator="\s Id""#)
cmd.arg(r"--locator=(?<=u){}(?=Id)")
.arg("tests/data/one.txt")
.assert()
.success()
.stdout("userId USER_ID user_id user-id userId UserId");
.stdout("ser SER ser ser ser Ser");
}

#[test]
Expand Down

0 comments on commit 3d0b464

Please sign in to comment.