Change inner regex library from "regex" to "fancy_regex" to support `…

…--locator` option's value using syntax like lookaround.
boholder · Dec 28, 2021 · 3d0b464 · 3d0b464
1 parent 8502c4a
commit 3d0b464
Show file tree

Hide file tree

Showing 6 changed files with 46 additions and 120 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "naming_clt"
-version = "1.0.0"
+version = "1.1.0"
 authors = ["Boholder <[email protected]>"]
 description = """
 Extract and convert the naming format(case|notation) of identifiers from files or stdin.
@@ -27,7 +27,7 @@ members = [
 ]
 
 [dependencies]
-naming_clt_lib = { version = "0.1.0", path = "crates/naming_clt_lib" }
+naming_clt_lib = { version = "0.2.0", path = "crates/naming_clt_lib" }
 atty = "~0.2.14"
 
 [dependencies.clap]

diff --git a/crates/core/app.rs b/crates/core/app.rs
@@ -67,7 +67,7 @@ fn args<'a, 'b>() -> Box<[Arg<'a, 'b>]> {
             .help(
                 "Set locator pairs around identifiers, \
                 the separator between prefix and suffix is a space; \
-                default(1): \"\\s \\s\"",
+                default(1): \"(?<=\\b){}(?=\\b)\"",
             )
             .takes_value(true)
             .multiple(true)
@@ -139,54 +139,21 @@ to the line contains given eof-word, to this tool.
 
 const LOCATOR_HELP: &str = "\
 Set locator pairs around identifiers, in each pair value,
-the delimiter between prefix and suffix is a space.
-
-[!] Although this tool provides this option for convenience,
-be careful when using it. See warning information below for detail.
+the delimiter between prefix and suffix is a pair of brackets \"{}\".
 
 The two parts of a pair are inserted directly into the
 regular pattern, so users need to manually escape the characters
 that need to be escaped. For [the regex syntax], check this document:
 
-    https://docs.rs/regex/1.5.4/regex/index.html#syntax
-
-Default(1): \"\\b \\b\", which will match the valid identifiers
-separated by \"Unicode word boundary (\\w on one side and \\W,
-\\A, or \\z on other)\" positions.
-
-Each value passed to this option will be transformed to an regex
-pattern looks like: \"(?:<prefix>|\\A)(identifier)(?:<suffix>|\\z)\",
-where \\A for matching the start of file position and
-\\z for matching the end of file position.
-
-So there is no need to worry about having a match right next to
-the start or end of the file. If the input is only one valid word,
-that word will also be matched because of the hardcoded logic
-described above.
-
-[!]WARNING:
-Sorry for letting you know these implementation details.
-The rust regex library \"regex\" will matches text with a
-\"non-overlapping\" way, while it doesn't support lookaround syntax.
-So for the implementation, I used non-capture tuples to
-match prefixes and suffixes, which have mentioned above.
-
-An unintentional locator value, for example \"\\s \\s\",
-will generates a pattern \"(?:\\s|\\A)([a-zA-Z0-9_-]+)(?:\\s|\\z)\".
-This pattern will only matches \"a\" and \"c\" on text \"a b c\",
-while \"b\" shares two space symbols with \"a\" and \"c\".
-First matches \"<start of file>a \", remain \"b c<eof>\",
-then matches \" c<eof>\", further extracting via group number
-gets \"a\" and \"c\" as final result.
-
-As you can see, locators that represent characters rather than
-positions may let to this unwanted result. Please use this option
-carefully and make sure the output of this tool is as expected,
-or use the grep method mentioned below as an alternative to
-extracting words from input text.
-
-[!]NOTE: Any incomplete pair value that couldn't be split
-into two part by space delimiter like \"a\",\" a\",\"a \"
+    https://docs.rs/fancy-regex/0.7.1/fancy_regex/index.html#syntax
+
+Default(1): \"(?<=\\b){}(?=\\b)\", which will match the valid identifiers
+separated by \"Unicode word boundary (\\w on one side and \\W, \\A, or \\z
+on the other side)\" positions.
+
+NOTE:
+Any incomplete pair value that couldn't be split
+into two parts by the \"{}\" delimiter like \"a\",\"{}a\",\"a{}\"
 will let the tool outputs nothing and exits with non-zero signal.
 
 NOTE:

diff --git a/crates/naming_clt_lib/Cargo.toml b/crates/naming_clt_lib/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "naming_clt_lib"
-version = "0.1.0"
+version = "0.2.0"
 authors = ["Boholder <[email protected]>"]
 description = """
 Functions which serve the command line tool [naming].
@@ -10,6 +10,6 @@ edition = "2018"
 license = "MIT"
 
 [dependencies]
-regex = "~1.5.4"
+fancy-regex = "~0.7.1"
 lazy_static = "~1.4.0"
 naming_lib = "~0.1.4"
diff --git a/crates/naming_clt_lib/src/extraction.rs b/crates/naming_clt_lib/src/extraction.rs
@@ -2,29 +2,29 @@ use std::collections::HashSet;
 use std::fs;
 use std::io::{self, BufRead};
 
-use regex::{self, Regex};
+use fancy_regex::Regex;
 
 /// Convert Vec<String> into a slice of &str in Rust:
 /// https://stackoverflow.com/a/41180422/11397457
 pub fn read_from_files<T: AsRef<str>>(
     files: &[T],
-    eof: Option<&str>,
+    logical_eof: Option<&str>,
 ) -> Result<Vec<String>, String> {
     let mut text = Vec::new();
     for file in files {
-        text.push(read_file(file.as_ref(), eof)?);
+        text.push(read_file(file.as_ref(), logical_eof)?);
     }
     Ok(text)
 }
 
 fn read_file(file: &str, eof: Option<&str>) -> Result<String, String> {
     match fs::read_to_string(file) {
-        Ok(text) => Ok(apply_eof_on_text(eof, text)),
+        Ok(text) => Ok(trim_text_with_logical_eof(eof, text)),
         Err(msg) => Err(format!("naming: {}: {}", file, msg)),
     }
 }
 
-fn apply_eof_on_text(eof: Option<&str>, text: String) -> String {
+fn trim_text_with_logical_eof(eof: Option<&str>, text: String) -> String {
     match eof {
         None => text,
         Some(eof) => {
@@ -45,7 +45,7 @@ where
 {
     let mut buffer = String::new();
     match input.read_to_string(&mut buffer) {
-        Ok(_) => Ok(apply_eof_on_text(eof, buffer)),
+        Ok(_) => Ok(trim_text_with_logical_eof(eof, buffer)),
         Err(msg) => Err(format!("naming: stdin: {}", msg)),
     }
 }
@@ -60,7 +60,10 @@ impl Captor {
     /// Options should be manually escaped by user.
     /// If there is a locator pair which couldn't be converted to regex, return an Err.
     pub fn new(locators: Option<Vec<String>>) -> Result<Captor, String> {
-        let locators = locators.unwrap_or_else(|| vec![r"\b \b".to_string()]);
+        // TODO: document in the manual that the double brackets "{}" stand for the word.
+        // Set default locator as word edge '\b'.
+        let locators =
+            locators.unwrap_or_else(|| vec![r"(?<=\b){}(?=\b)".to_string()]);
         let mut patterns = Vec::new();
         for locator in locators {
             patterns.push(Captor::build_pattern_from(locator)?);
@@ -69,21 +72,19 @@ impl Captor {
     }
 
     fn build_pattern_from(locator: String) -> Result<Regex, String> {
-        let pair = locator.split_once(" ");
+        let pair = locator.split_once("{}");
         if pair.is_none() {
             return Err(format!(
-                "naming: locator `{}`: can't build locator pair from this.",
+                "naming: locator `{}`: can't split locator pair from this.",
                 locator
             ));
         }
         let pair = pair.unwrap();
 
-        Ok(Regex::new(
-            // \A for start of file position,
-            // \z for end of file position
-            &format!(r"(?:{}|\A)([a-zA-Z0-9_-]+)(?:{}|\z)", pair.0, pair.1),
-        )
-        .unwrap())
+        // Currently, we use `([a-zA-Z0-9_-]+)` to match words, it's ok for now,
+        // because it could match words with any naming format.
+        Ok(Regex::new(&format!(r"{}([a-zA-Z0-9_-]+){}", pair.0, pair.1))
+            .unwrap())
     }
 
     /// Extract words from given long text string,
@@ -96,7 +97,6 @@ impl Captor {
         // https://users.rust-lang.org/t/deduplicate-vector-in-place-while-preserving-order/56568/6
         let mut set = HashSet::new();
         matches.retain(|word| set.insert(word.clone()));
-
         matches
     }
 
@@ -107,10 +107,9 @@ impl Captor {
                 self.patterns
                     .iter()
                     .map(move |pattern| {
-                        pattern
-                            .captures_iter(t)
-                            .into_iter()
-                            .map(|cap| cap[1].to_string())
+                        pattern.captures_iter(t).into_iter().map(|cap| {
+                            cap.unwrap().get(1).unwrap().as_str().to_string()
+                        })
                     })
                     .flatten()
                 // now get one file's matches
@@ -164,7 +163,7 @@ mod captor_tests {
         let text =
             to_string_vec(vec!["int i = 1; String s = oneMethod(arg1, arg2);"]);
         let actual =
-            Captor::new(Some(to_string_vec(vec![r"\s \s*=", r"\s \s*;"])))
+            Captor::new(Some(to_string_vec(vec![r"\s{}\s*=", r"\s{}\s*;"])))
                 .unwrap()
                 .capture_words(text);
         let expect: Vec<String> = to_string_vec(vec!["i", "s", "1"]);
@@ -176,7 +175,7 @@ mod captor_tests {
         let text = to_string_vec(vec!["@now#can$be&matched"]);
         // note that "$" is manually escaped.
         let locators: Vec<String> =
-            to_string_vec(vec![r"# \$", "@ #", r"\$ &", r"& \z"]);
+            to_string_vec(vec![r"#{}\$", "@{}#", r"\${}&", r"&{}\z"]);
 
         let actual = Captor::new(Some(locators)).unwrap().capture_words(text);
         // notice that the result order is based on option order.
@@ -192,7 +191,7 @@ mod captor_tests {
             "let a = 1; let b = 2; let c = 3;",
         ]);
         let actual =
-            Captor::new(Some(to_string_vec(vec![r"\s \s*=", r"\s \s*;"])))
+            Captor::new(Some(to_string_vec(vec![r"\s{}\s*=", r"\s{}\s*;"])))
                 .unwrap()
                 .capture_words(text);
         // notice that the result order is based on option order.

diff --git a/doc/USERGUIDE.md b/doc/USERGUIDE.md
@@ -111,53 +111,20 @@ OPTIONS:
 
     -l, --locator <locator>...
             Set locator pairs around identifiers, in each pair value,
-            the delimiter between prefix and suffix is a space.
-
-            [!] Although this tool provides this option for convenience,
-            be careful when using it. See warning information below for detail.
+            the delimiter between prefix and suffix is a pair of brackets "{}".
 
             The two parts of a pair are inserted directly into the
             regular pattern, so users need to manually escape the characters
             that need to be escaped. For [the regex syntax], check this document:
 
-                https://docs.rs/regex/1.5.4/regex/index.html#syntax
+                https://docs.rs/fancy-regex/0.7.1/fancy_regex/index.html#syntax
 
-            Default(1): "\b \b", which will match the valid identifiers
+            Default(1): "(?<=\b){}(?=\b)", which will match the valid identifiers
             separated by "Unicode word boundary (\w on one side and \W,
             \A, or \z on other)" positions.
 
-            Each value passed to this option will be transformed to an regex
-            pattern looks like: "(?:<prefix>|\A)(identifier)(?:<suffix>|\z)",
-            where \A for matching the start of file position and
-            \z for matching the end of file position.
-
-            So there is no need to worry about having a match right next to
-            the start or end of the file. If the input is only one valid word,
-            that word will also be matched because of the hardcoded logic
-            described above.
-
-            [!]WARNING:
-            Sorry for letting you know these implementation details.
-            The rust regex library "regex" will matches text with a
-            "non-overlapping" way, while it doesn't support lookaround syntax.
-            So for the implementation, I used non-capture tuples to
-            match prefixes and suffixes, which have mentioned above.
-
-            An unintentional locator value, for example "\s \s",
-            will generates a pattern "(?:\s|\A)([a-zA-Z0-9_-]+)(?:\s|\z)".
-            This pattern will only matches "a" and "c" on text "a b c",
-            while "b" shares two space symbols with "a" and "c".
-            First matches "<start of file>a ", remain "b c<eof>",
-            then matches " c<eof>", further extracting via group number
-            gets "a" and "c" as final result.
-
-            As you can see, locators that represent characters rather than
-            positions may let to this unwanted result. Please use this option
-            carefully and make sure the output of this tool is as expected,
-            or use the grep method mentioned below as an alternative to
-            extracting words from input text.
-
-            [!]NOTE: Any incomplete pair value that couldn't be split
+            NOTE:
+            Any incomplete pair value that couldn't be split
             into two parts by the "{}" delimiter like "a","{}a","a{}"
             will let the tool outputs nothing and exits with non-zero signal.
 
@@ -181,7 +148,6 @@ OPTIONS:
 
             Commands above piped "first\nsecond" to this tool.
 
-
     -o, --output <output>...
             Set which naming cases that matches will be converted to.
 

diff --git a/tests/flags_and_options.rs b/tests/flags_and_options.rs
@@ -42,27 +42,21 @@ SCREAMING_SNAKE SCREAMING_SNAKE screaming_snake screaming-snake screamingSnake S
 #[test]
 fn locator() {
     let mut cmd = Command::cargo_bin("naming").unwrap();
-    cmd.arg(r"--locator=\scam Case\s,ke -case")
+    cmd.arg(r"--locator=\scam{}Case\s,ke{}-case")
         .arg("tests/data/all.txt")
         .assert()
         .success()
         .stdout("el EL el el el El\nbab BAB bab bab bab Bab");
 }
 
 #[test]
-fn locator_invalid_when_single_valid_word_appears_in_special_position() {
+fn locator_support_lookaround() {
     let mut cmd = Command::cargo_bin("naming").unwrap();
-    // Note that in one.txt, word "userId " has a space after it.
-    // If remove that space, the hardcoded pair "\A(word)\z"
-    // will match the "userId" because:
-    // 1. it's a valid word
-    // 2. it's head position is "start of file" (\A),
-    //    and tail position is "end of file" (\z)
-    cmd.arg(r#"--locator="\s Id""#)
+    cmd.arg(r"--locator=(?<=u){}(?=Id)")
         .arg("tests/data/one.txt")
         .assert()
         .success()
-        .stdout("userId USER_ID user_id user-id userId UserId");
+        .stdout("ser SER ser ser ser Ser");
 }
 
 #[test]