WorksApplications · mh-northlander · May 31, 2024 · May 31, 2024 · Jun 3, 2024 · Jun 28, 2024
diff --git a/python/src/word_info.rs b/python/src/word_info.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -33,9 +33,9 @@ pub struct PyWordInfo {
     synonym_group_ids: Vec<u32>,
 }
 
-fn copy_if_empty(v1: String, v2: &String) -> String {
+fn copy_if_empty(v1: String, v2: &str) -> String {
     if v1.is_empty() {
-        v2.clone()
+        v2.to_owned()
     } else {
         v1
     }

diff --git a/sudachi-cli/src/analysis.rs b/sudachi-cli/src/analysis.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -51,14 +51,18 @@ impl<'a> Analysis for SplitSentencesOnly<'a> {
     }
 }
 
-pub struct AnalyzeNonSplitted<D: DictionaryAccess, O: SudachiOutput<D>> {
-    output: O,
+/// Analysis that pushes each input to the tokenizer as a whole, without sentence splitting.
+///
+/// `output` is boxed and `O` may be unsized, so `O` can be a trait object such as
+/// `dyn SudachiOutput<D>`.
+pub struct AnalyzeNonSplitted<D: DictionaryAccess, O: SudachiOutput<D> + ?Sized> {
+    output: Box<O>,
     analyzer: StatefulTokenizer<D>,
     morphemes: MorphemeList<D>,
 }
 
-impl<D: DictionaryAccess + Clone, O: SudachiOutput<D>> AnalyzeNonSplitted<D, O> {
-    pub fn new(output: O, dict: D, mode: Mode, enable_debug: bool) -> Self {
+impl<D: DictionaryAccess + Clone, O: SudachiOutput<D> + ?Sized> AnalyzeNonSplitted<D, O> {
+    pub fn new(output: Box<O>, dict: D, mode: Mode, enable_debug: bool) -> Self {
         Self {
             output,
             morphemes: MorphemeList::empty(dict.clone()),
@@ -67,7 +71,7 @@ impl<D: DictionaryAccess + Clone, O: SudachiOutput<D>> AnalyzeNonSplitted<D, O>
     }
 }
 
-impl<D: DictionaryAccess, O: SudachiOutput<D>> Analysis for AnalyzeNonSplitted<D, O> {
+impl<D: DictionaryAccess, O: SudachiOutput<D> + ?Sized> Analysis for AnalyzeNonSplitted<D, O> {
     fn analyze(&mut self, input: &str, writer: &mut Writer) {
         self.analyzer.reset().push_str(input);
         self.analyzer
@@ -86,21 +90,24 @@ impl<D: DictionaryAccess, O: SudachiOutput<D>> Analysis for AnalyzeNonSplitted<D
     }
 }
 
-pub struct AnalyzeSplitted<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> {
+/// Analysis that splits the input into sentences and analyzes each one with `inner`.
+pub struct AnalyzeSplitted<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> {
     splitter: SentenceSplitter<'a>,
     inner: AnalyzeNonSplitted<&'a D, O>,
 }
 
-impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> AnalyzeSplitted<'a, D, O> {
-    pub fn new(output: O, dict: &'a D, mode: Mode, enable_debug: bool) -> Self {
+impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> AnalyzeSplitted<'a, D, O> {
+    pub fn new(output: Box<O>, dict: &'a D, mode: Mode, enable_debug: bool) -> Self {
         Self {
             inner: AnalyzeNonSplitted::new(output, dict, mode, enable_debug),
             splitter: SentenceSplitter::new().with_checker(dict.lexicon()),
         }
     }
 }
 
-impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> Analysis for AnalyzeSplitted<'a, D, O> {
+impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> Analysis
+    for AnalyzeSplitted<'a, D, O>
+{
     fn analyze(&mut self, input: &str, writer: &mut Writer) {
         for (_, sent) in self.splitter.split(input) {
             self.inner.analyze(sent, writer);

diff --git a/sudachi-cli/src/main.rs b/sudachi-cli/src/main.rs
@@ -27,14 +27,12 @@ use clap::Parser;
 
 use crate::analysis::{Analysis, AnalyzeNonSplitted, AnalyzeSplitted, SplitSentencesOnly};
 use crate::build::{build_main, is_build_mode, BuildCli};
+use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
 use sudachi::config::Config;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::prelude::*;
 
-#[cfg(feature = "bake_dictionary")]
-const BAKED_DICTIONARY_BYTES: &[u8] = include_bytes!(env!("SUDACHI_DICT_PATH"));
-
-#[derive(Clone, Debug, Eq, PartialEq, Default)]
+#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
 pub enum SentenceSplitMode {
     /// Do both sentence splitting and analysis
     #[default]
@@ -117,16 +115,42 @@ struct Cli {
     command: Option<BuildCli>,
 }
 
-// want to instantiate a different type for different output format
-// this takes a f as a function which will be created with a different actual type
-macro_rules! with_output {
-    ($cli: expr, $f: expr) => {
-        if $cli.wakati {
-            Box::new($f(output::Wakachi::default()))
-        } else {
-            Box::new($f(output::Simple::new($cli.print_all)))
-        }
-    };
+/// Creates the boxed [`output::SudachiOutput`] implementation selected by the flags.
+///
+/// Returns [`output::Wakachi`] when `wakachi` is `true`; otherwise returns
+/// [`output::Simple`] constructed with `print_all`.
+pub fn setup_output<D: DictionaryAccess>(
+    wakachi: bool,
+    print_all: bool,
+) -> Box<dyn output::SudachiOutput<D>> {
+    if wakachi {
+        Box::new(output::Wakachi::default())
+    } else {
+        Box::new(output::Simple::new(print_all))
+    }
+}
+
+/// Builds the [`Analysis`] implementation selected by `args.split_sentences`.
+///
+/// `Only` yields a [`SplitSentencesOnly`], `Default` an [`AnalyzeSplitted`] and `None` an
+/// [`AnalyzeNonSplitted`]; the latter two get their output from [`setup_output`]. The returned
+/// box is bounded by the lifetime `'a` of the `dict` borrow.
+fn setup_analyzer<'a>(args: &Cli, dict: &'a impl DictionaryAccess) -> Box<dyn Analysis + 'a> {
+    match args.split_sentences {
+        SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(dict)),
+        SentenceSplitMode::Default => Box::new(AnalyzeSplitted::new(
+            setup_output(args.wakati, args.print_all),
+            dict,
+            args.mode,
+            args.enable_debug,
+        )),
+        SentenceSplitMode::None => Box::new(AnalyzeNonSplitted::new(
+            setup_output(args.wakati, args.print_all),
+            dict,
+            args.mode,
+            args.enable_debug,
+        )),
+    }
 }
 
 fn main() {
@@ -169,15 +193,7 @@ fn main() {
     let dict = JapaneseDictionary::from_cfg(&config)
         .unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e));
 
-    let mut analyzer: Box<dyn Analysis> = match args.split_sentences {
-        SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(&dict)),
-        SentenceSplitMode::Default => with_output!(args, |o| {
-            AnalyzeSplitted::new(o, &dict, args.mode, args.enable_debug)
-        }),
-        SentenceSplitMode::None => with_output!(args, |o| {
-            AnalyzeNonSplitted::new(o, &dict, args.mode, args.enable_debug)
-        }),
-    };
+    let mut analyzer: Box<dyn Analysis> = setup_analyzer(&args, &dict);
 
     let mut data = String::with_capacity(4 * 1024);
     let is_stdout = args.output_file.is_none();
@@ -213,6 +229,7 @@ fn strip_eol(data: &str) -> &str {
     // Safety: str was correct and we only removed full characters
     unsafe { std::str::from_utf8_unchecked(bytes) }
 }
+
 #[cfg(test)]
 mod tests {
     use clap::CommandFactory;

diff --git a/sudachi/src/analysis/stateless_tokenizer.rs b/sudachi/src/analysis/stateless_tokenizer.rs
@@ -132,7 +132,7 @@ pub(super) fn split_path<T: DictionaryAccess + ?Sized>(
     Ok(new_path)
 }
 
-pub(super) fn dump_path(path: &Vec<ResultNode>) {
+pub(super) fn dump_path(path: &[ResultNode]) {
     for (i, node) in path.iter().enumerate() {
         println!("{}: {}", i, node);
     }

diff --git a/sudachi/src/dic/build/lexicon.rs b/sudachi/src/dic/build/lexicon.rs
@@ -84,7 +84,7 @@ impl StrPosEntry {
         Self { data: owned }
     }
 
-    pub fn from_built_pos(data: &Vec<String>) -> Self {
+    pub fn from_built_pos(data: &[String]) -> Self {
         let mut iter = data.iter().map(|x| x.as_str());
         let p1 = Cow::Borrowed(iter.next().unwrap());
         let p2 = Cow::Borrowed(iter.next().unwrap());

diff --git a/sudachi/src/dic/lexicon/word_infos.rs b/sudachi/src/dic/lexicon/word_infos.rs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,6 @@
  * limitations under the License.
  */
 
-use std::iter::FusedIterator;
-
-use crate::dic::lexicon_set::LexiconSet;
 use crate::dic::read::u32_parser;
 use crate::dic::read::word_info::WordInfoParser;
 use crate::dic::subset::InfoSubset;
@@ -177,30 +174,3 @@ impl From<WordInfo> for WordInfoData {
         info.data
     }
 }
-
-struct SplitIter<'a> {
-    index: usize,
-    split: &'a [WordId],
-    lexicon: &'a LexiconSet<'a>,
-}
-
-impl Iterator for SplitIter<'_> {
-    type Item = SudachiResult<WordInfo>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let idx = self.index;
-        if idx >= self.split.len() {
-            None
-        } else {
-            self.index += 1;
-            Some(self.lexicon.get_word_info(self.split[idx]))
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let rem = self.split.len() - self.index;
-        (rem, Some(rem))
-    }
-}
-
-impl FusedIterator for SplitIter<'_> {}
diff --git a/sudachi/src/dic/lexicon_set.rs b/sudachi/src/dic/lexicon_set.rs
@@ -142,7 +142,7 @@ impl LexiconSet<'_> {
         self.lexicons[dic_id].get_word_param(id.word())
     }
 
-    fn update_dict_id(split: &mut Vec<WordId>, dict_id: u8) -> SudachiResult<()> {
+    fn update_dict_id(split: &mut [WordId], dict_id: u8) -> SudachiResult<()> {
         for id in split.iter_mut() {
             let cur_dict_id = id.dic();
             if cur_dict_id > 0 {

diff --git a/sudachi/src/error.rs b/sudachi/src/error.rs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 
 use std::fmt::Debug;
-use std::io::Error;
 use thiserror::Error;
 
 use crate::config::ConfigError;
@@ -50,7 +49,12 @@ pub enum SudachiError {
     FromUtf16(#[from] std::string::FromUtf16Error),
 
     #[error("Regex error")]
-    RegexError(#[from] fancy_regex::Error),
+    RegexError {
+        // `#[from]` used to mark this field as the error source implicitly; with the manual
+        // `From` impl it has to be marked explicitly or `source()` returns `None`.
+        #[source]
+        cause: Box<fancy_regex::Error>,
+    },
 
     #[error("Error from nom {0}")]
     NomParseError(String),
@@ -108,14 +112,21 @@ pub enum SudachiError {
 }
 
 impl From<std::io::Error> for SudachiError {
-    fn from(e: Error) -> Self {
+    fn from(e: std::io::Error) -> Self {
         SudachiError::Io {
             cause: e,
             context: String::from("IO Error"),
         }
     }
 }
 
+/// Wraps a [`fancy_regex::Error`] into [`SudachiError::RegexError`], boxing it.
+impl From<fancy_regex::Error> for SudachiError {
+    fn from(e: fancy_regex::Error) -> Self {
+        SudachiError::RegexError { cause: Box::new(e) }
+    }
+}
+
 impl SudachiError {
     pub fn with_context<S: Into<String>>(self, ctx: S) -> Self {
         match self {

diff --git a/sudachi/src/input_text/buffer/edit.rs b/sudachi/src/input_text/buffer/edit.rs
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2021 Works Applications Co., Ltd.
+ *  Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -89,7 +89,7 @@ impl<'a> InputEditor<'a> {
 // Current plugin implementations satisfy this criteria.
 pub fn resolve_edits(
     source: &str,
-    source_mapping: &Vec<usize>,
+    source_mapping: &[usize],
     target: &mut String,
     target_mapping: &mut Vec<usize>,
     edits: &mut Vec<ReplaceOp>,
@@ -127,7 +127,7 @@ pub fn resolve_edits(
 }
 
 fn add_replace(
-    source_mapping: &Vec<usize>,
+    source_mapping: &[usize],
     target: &mut String,
     target_mapping: &mut Vec<usize>,
     what: Range<usize>,

diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs
@@ -179,7 +179,7 @@ impl NumericParser {
     }
 
     pub fn get_normalized(&mut self) -> String {
-        self.total.to_string()
+        self.total.get_normalized()
     }
 
     fn check_comma(&self) -> bool {

diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs
@@ -100,7 +100,7 @@ impl StringNumber {
         self.significand.is_empty()
     }
 
-    pub fn to_string(&mut self) -> String {
+    pub fn get_normalized(&mut self) -> String {
         if self.is_zero() {
             return "0".to_owned();
         }