diff --git a/python/src/word_info.rs b/python/src/word_info.rs index eb51a28d..50283402 100644 --- a/python/src/word_info.rs +++ b/python/src/word_info.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,9 +33,9 @@ pub struct PyWordInfo { synonym_group_ids: Vec, } -fn copy_if_empty(v1: String, v2: &String) -> String { +fn copy_if_empty(v1: String, v2: &str) -> String { if v1.is_empty() { - v2.clone() + v2.to_owned() } else { v1 } diff --git a/sudachi-cli/src/analysis.rs b/sudachi-cli/src/analysis.rs index 89bca916..d95881c1 100644 --- a/sudachi-cli/src/analysis.rs +++ b/sudachi-cli/src/analysis.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,14 +51,14 @@ impl<'a> Analysis for SplitSentencesOnly<'a> { } } -pub struct AnalyzeNonSplitted> { - output: O, +pub struct AnalyzeNonSplitted + ?Sized> { + output: Box, analyzer: StatefulTokenizer, morphemes: MorphemeList, } -impl> AnalyzeNonSplitted { - pub fn new(output: O, dict: D, mode: Mode, enable_debug: bool) -> Self { +impl + ?Sized> AnalyzeNonSplitted { + pub fn new(output: Box, dict: D, mode: Mode, enable_debug: bool) -> Self { Self { output, morphemes: MorphemeList::empty(dict.clone()), @@ -67,7 +67,7 @@ impl> AnalyzeNonSplitted } } -impl> Analysis for AnalyzeNonSplitted { +impl + ?Sized> Analysis for AnalyzeNonSplitted { fn analyze(&mut self, input: &str, writer: &mut Writer) { self.analyzer.reset().push_str(input); self.analyzer @@ -86,13 +86,13 @@ impl> Analysis for AnalyzeNonSplitted> { +pub struct AnalyzeSplitted<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> { splitter: SentenceSplitter<'a>, inner: AnalyzeNonSplitted<&'a D, O>, } -impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> AnalyzeSplitted<'a, D, O> { - pub fn new(output: O, dict: &'a D, mode: Mode, enable_debug: bool) -> Self { +impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> AnalyzeSplitted<'a, D, O> { + pub fn new(output: Box, dict: &'a D, mode: Mode, enable_debug: bool) -> Self { Self { inner: AnalyzeNonSplitted::new(output, dict, mode, enable_debug), splitter: SentenceSplitter::new().with_checker(dict.lexicon()), @@ -100,7 +100,9 @@ impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> AnalyzeSplitted<'a, } } -impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> Analysis for AnalyzeSplitted<'a, D, O> { +impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> Analysis + for AnalyzeSplitted<'a, D, O> +{ fn analyze(&mut self, input: &str, writer: &mut Writer) { for (_, sent) in self.splitter.split(input) { self.inner.analyze(sent, writer); diff --git a/sudachi-cli/src/main.rs b/sudachi-cli/src/main.rs index 3d825d15..ceac0791 100644 --- a/sudachi-cli/src/main.rs +++ b/sudachi-cli/src/main.rs @@ -27,14 +27,12 @@ use clap::Parser; use crate::analysis::{Analysis, AnalyzeNonSplitted, AnalyzeSplitted, SplitSentencesOnly}; use crate::build::{build_main, is_build_mode, BuildCli}; +use sudachi::analysis::stateless_tokenizer::DictionaryAccess; use sudachi::config::Config; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::prelude::*; -#[cfg(feature = "bake_dictionary")] -const BAKED_DICTIONARY_BYTES: &[u8] = include_bytes!(env!("SUDACHI_DICT_PATH")); - -#[derive(Clone, Debug, Eq, PartialEq, Default)] +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub enum SentenceSplitMode { /// Do both sentence splitting and analysis #[default] @@ -117,16 +115,33 @@ struct Cli { command: Option, } -// want to instantiate a different type for different output format -// this takes a f as a function which will be created with a different actual type -macro_rules! with_output { - ($cli: expr, $f: expr) => { - if $cli.wakati { - Box::new($f(output::Wakachi::default())) - } else { - Box::new($f(output::Simple::new($cli.print_all))) - } - }; +pub fn setup_output( + wakachi: bool, + print_all: bool, +) -> Box> { + if wakachi { + Box::new(output::Wakachi::default()) + } else { + Box::new(output::Simple::new(print_all)) + } +} + +fn setup_analyzer<'a>(args: &Cli, dict: &'a impl DictionaryAccess) -> Box { + match args.split_sentences { + SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(dict)), + SentenceSplitMode::Default => Box::new(AnalyzeSplitted::new( + setup_output(args.wakati, args.print_all), + dict, + args.mode, + args.enable_debug, + )), + SentenceSplitMode::None => Box::new(AnalyzeNonSplitted::new( + setup_output(args.wakati, args.print_all), + dict, + args.mode, + args.enable_debug, + )), + } } fn main() { @@ -169,15 +184,7 @@ fn main() { let dict = JapaneseDictionary::from_cfg(&config) .unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e)); - let mut analyzer: Box = match args.split_sentences { - SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(&dict)), - SentenceSplitMode::Default => with_output!(args, |o| { - AnalyzeSplitted::new(o, &dict, args.mode, args.enable_debug) - }), - SentenceSplitMode::None => with_output!(args, |o| { - AnalyzeNonSplitted::new(o, &dict, args.mode, args.enable_debug) - }), - }; + let mut analyzer: Box = setup_analyzer(&args, &dict); let mut data = String::with_capacity(4 * 1024); let is_stdout = args.output_file.is_none(); @@ -213,6 +220,7 @@ fn strip_eol(data: &str) -> &str { // Safety: str was correct and we only removed full characters unsafe { std::str::from_utf8_unchecked(bytes) } } + #[cfg(test)] mod tests { use clap::CommandFactory; diff --git a/sudachi/src/analysis/stateless_tokenizer.rs b/sudachi/src/analysis/stateless_tokenizer.rs index 669d7720..eabee275 100644 --- a/sudachi/src/analysis/stateless_tokenizer.rs +++ b/sudachi/src/analysis/stateless_tokenizer.rs @@ -132,7 +132,7 @@ pub(super) fn split_path( Ok(new_path) } -pub(super) fn dump_path(path: &Vec) { +pub(super) fn dump_path(path: &[ResultNode]) { for (i, node) in path.iter().enumerate() { println!("{}: {}", i, node); } diff --git a/sudachi/src/dic/build/lexicon.rs b/sudachi/src/dic/build/lexicon.rs index 49e44963..6514c4e7 100644 --- a/sudachi/src/dic/build/lexicon.rs +++ b/sudachi/src/dic/build/lexicon.rs @@ -84,7 +84,7 @@ impl StrPosEntry { Self { data: owned } } - pub fn from_built_pos(data: &Vec) -> Self { + pub fn from_built_pos(data: &[String]) -> Self { let mut iter = data.iter().map(|x| x.as_str()); let p1 = Cow::Borrowed(iter.next().unwrap()); let p2 = Cow::Borrowed(iter.next().unwrap()); diff --git a/sudachi/src/dic/lexicon/word_infos.rs b/sudachi/src/dic/lexicon/word_infos.rs index 4be9cf85..8b6e22dd 100644 --- a/sudachi/src/dic/lexicon/word_infos.rs +++ b/sudachi/src/dic/lexicon/word_infos.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. */ -use std::iter::FusedIterator; - -use crate::dic::lexicon_set::LexiconSet; use crate::dic::read::u32_parser; use crate::dic::read::word_info::WordInfoParser; use crate::dic::subset::InfoSubset; @@ -177,30 +174,3 @@ impl From for WordInfoData { info.data } } - -struct SplitIter<'a> { - index: usize, - split: &'a [WordId], - lexicon: &'a LexiconSet<'a>, -} - -impl Iterator for SplitIter<'_> { - type Item = SudachiResult; - - fn next(&mut self) -> Option { - let idx = self.index; - if idx >= self.split.len() { - None - } else { - self.index += 1; - Some(self.lexicon.get_word_info(self.split[idx])) - } - } - - fn size_hint(&self) -> (usize, Option) { - let rem = self.split.len() - self.index; - (rem, Some(rem)) - } -} - -impl FusedIterator for SplitIter<'_> {} diff --git a/sudachi/src/dic/lexicon_set.rs b/sudachi/src/dic/lexicon_set.rs index 3a00ab68..1f41632b 100644 --- a/sudachi/src/dic/lexicon_set.rs +++ b/sudachi/src/dic/lexicon_set.rs @@ -142,7 +142,7 @@ impl LexiconSet<'_> { self.lexicons[dic_id].get_word_param(id.word()) } - fn update_dict_id(split: &mut Vec, dict_id: u8) -> SudachiResult<()> { + fn update_dict_id(split: &mut [WordId], dict_id: u8) -> SudachiResult<()> { for id in split.iter_mut() { let cur_dict_id = id.dic(); if cur_dict_id > 0 { diff --git a/sudachi/src/error.rs b/sudachi/src/error.rs index 4e43b64a..34a10498 100644 --- a/sudachi/src/error.rs +++ b/sudachi/src/error.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ use std::fmt::Debug; -use std::io::Error; use thiserror::Error; use crate::config::ConfigError; @@ -50,7 +49,7 @@ pub enum SudachiError { FromUtf16(#[from] std::string::FromUtf16Error), #[error("Regex error")] - RegexError(#[from] fancy_regex::Error), + RegexError { cause: Box }, #[error("Error from nom {0}")] NomParseError(String), @@ -108,7 +107,7 @@ pub enum SudachiError { } impl From for SudachiError { - fn from(e: Error) -> Self { + fn from(e: std::io::Error) -> Self { SudachiError::Io { cause: e, context: String::from("IO Error"), @@ -116,6 +115,12 @@ impl From for SudachiError { } } +impl From for SudachiError { + fn from(e: fancy_regex::Error) -> Self { + SudachiError::RegexError { cause: Box::new(e) } + } +} + impl SudachiError { pub fn with_context>(self, ctx: S) -> Self { match self { diff --git a/sudachi/src/input_text/buffer/edit.rs b/sudachi/src/input_text/buffer/edit.rs index 300f73ab..41ac5232 100644 --- a/sudachi/src/input_text/buffer/edit.rs +++ b/sudachi/src/input_text/buffer/edit.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ impl<'a> InputEditor<'a> { // Current plugin implementations satisfy this criteria. pub fn resolve_edits( source: &str, - source_mapping: &Vec, + source_mapping: &[usize], target: &mut String, target_mapping: &mut Vec, edits: &mut Vec, @@ -127,7 +127,7 @@ pub fn resolve_edits( } fn add_replace( - source_mapping: &Vec, + source_mapping: &[usize], target: &mut String, target_mapping: &mut Vec, what: Range, diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs index 27c3c481..da2e87cc 100644 --- a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs +++ b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs @@ -179,7 +179,7 @@ impl NumericParser { } pub fn get_normalized(&mut self) -> String { - self.total.to_string() + self.total.get_normalized() } fn check_comma(&self) -> bool { diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs index eaf86e60..7e03ffe7 100644 --- a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs +++ b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs @@ -100,7 +100,7 @@ impl StringNumber { self.significand.is_empty() } - pub fn to_string(&mut self) -> String { + pub fn get_normalized(&mut self) -> String { if self.is_zero() { return "0".to_owned(); }