diff --git a/.github/workflows/rust-lint-fmt.yml b/.github/workflows/rust-lint-fmt.yml index f4a368fb..af733f52 100644 --- a/.github/workflows/rust-lint-fmt.yml +++ b/.github/workflows/rust-lint-fmt.yml @@ -10,8 +10,11 @@ env: jobs: build: runs-on: ubuntu-latest - steps: - uses: actions/checkout@v3 - name: Rustfmt - run: cargo fmt -- --check --files-with-diff \ No newline at end of file + run: cargo fmt -- --check + - name: Run Clippy + run: cargo clippy --all-targets --all-features + env: + RUSTFLAGS: "-Dwarnings" # fail on warning diff --git a/sudachi-cli/src/analysis.rs b/sudachi-cli/src/analysis.rs index 89bca916..d95881c1 100644 --- a/sudachi-cli/src/analysis.rs +++ b/sudachi-cli/src/analysis.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -51,14 +51,14 @@ impl<'a> Analysis for SplitSentencesOnly<'a> { } } -pub struct AnalyzeNonSplitted> { - output: O, +pub struct AnalyzeNonSplitted + ?Sized> { + output: Box, analyzer: StatefulTokenizer, morphemes: MorphemeList, } -impl> AnalyzeNonSplitted { - pub fn new(output: O, dict: D, mode: Mode, enable_debug: bool) -> Self { +impl + ?Sized> AnalyzeNonSplitted { + pub fn new(output: Box, dict: D, mode: Mode, enable_debug: bool) -> Self { Self { output, morphemes: MorphemeList::empty(dict.clone()), @@ -67,7 +67,7 @@ impl> AnalyzeNonSplitted } } -impl> Analysis for AnalyzeNonSplitted { +impl + ?Sized> Analysis for AnalyzeNonSplitted { fn analyze(&mut self, input: &str, writer: &mut Writer) { self.analyzer.reset().push_str(input); self.analyzer @@ -86,13 +86,13 @@ impl> Analysis for AnalyzeNonSplitted> { +pub struct AnalyzeSplitted<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> { splitter: SentenceSplitter<'a>, inner: AnalyzeNonSplitted<&'a D, O>, } -impl<'a, D: DictionaryAccess + 'a, O: 
SudachiOutput<&'a D>> AnalyzeSplitted<'a, D, O> { - pub fn new(output: O, dict: &'a D, mode: Mode, enable_debug: bool) -> Self { +impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> AnalyzeSplitted<'a, D, O> { + pub fn new(output: Box, dict: &'a D, mode: Mode, enable_debug: bool) -> Self { Self { inner: AnalyzeNonSplitted::new(output, dict, mode, enable_debug), splitter: SentenceSplitter::new().with_checker(dict.lexicon()), @@ -100,7 +100,9 @@ impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> AnalyzeSplitted<'a, } } -impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> Analysis for AnalyzeSplitted<'a, D, O> { +impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> Analysis + for AnalyzeSplitted<'a, D, O> +{ fn analyze(&mut self, input: &str, writer: &mut Writer) { for (_, sent) in self.splitter.split(input) { self.inner.analyze(sent, writer); diff --git a/sudachi-cli/src/build.rs b/sudachi-cli/src/build.rs index eea11ecf..4c92dc53 100644 --- a/sudachi-cli/src/build.rs +++ b/sudachi-cli/src/build.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -172,12 +172,12 @@ fn output_file(p: &Path) -> File { OpenOptions::new() .write(true) .create_new(true) - .open(&p) + .open(p) .unwrap_or_else(|e| panic!("failed to open {:?} for writing:\n{:?}", p, e)) } fn dump_part(dict: PathBuf, part: String, output: PathBuf) { - let file = File::open(&dict).expect("open failed"); + let file = File::open(dict).expect("open failed"); let data = unsafe { Mmap::map(&file) }.expect("mmap failed"); let loader = unsafe { DictionaryLoader::read_any_dictionary(&data) }.expect("failed to load dictionary"); @@ -215,7 +215,7 @@ fn dump_matrix(grammar: &Grammar, w: &mut W) { for left in 0..conn.num_left() { for right in 0..conn.num_right() { let cost = conn.cost(left as _, right as _); - write!(w, "{} {} {}\n", left, right, cost).unwrap(); + writeln!(w, "{} {} {}", left, right, cost).unwrap(); } } } diff --git a/sudachi-cli/src/main.rs b/sudachi-cli/src/main.rs index 69aabc7b..390f7a3a 100644 --- a/sudachi-cli/src/main.rs +++ b/sudachi-cli/src/main.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,16 +27,15 @@ use clap::Parser; use crate::analysis::{Analysis, AnalyzeNonSplitted, AnalyzeSplitted, SplitSentencesOnly}; use crate::build::{build_main, is_build_mode, BuildCli}; +use sudachi::analysis::stateless_tokenizer::DictionaryAccess; use sudachi::config::Config; use sudachi::dic::dictionary::JapaneseDictionary; use sudachi::prelude::*; -#[cfg(feature = "bake_dictionary")] -const BAKED_DICTIONARY_BYTES: &[u8] = include_bytes!(env!("SUDACHI_DICT_PATH")); - -#[derive(Clone, Debug, Eq, PartialEq)] +#[derive(Clone, Copy, Debug, Eq, PartialEq, Default)] pub enum SentenceSplitMode { /// Do both sentence splitting and analysis + #[default] Default, /// Do only sentence splitting and not analysis Only, @@ -44,12 +43,6 @@ pub enum SentenceSplitMode { None, } -impl Default for SentenceSplitMode { - fn default() -> Self { - SentenceSplitMode::Default - } -} - impl FromStr for SentenceSplitMode { type Err = &'static str; @@ -122,16 +115,33 @@ struct Cli { command: Option, } -// want to instantiate a different type for different output format -// this takes a f as a function which will be created with a different actual type -macro_rules! 
with_output { - ($cli: expr, $f: expr) => { - if $cli.wakati { - Box::new($f(output::Wakachi::default())) - } else { - Box::new($f(output::Simple::new($cli.print_all))) - } - }; +pub fn setup_output( + wakachi: bool, + print_all: bool, +) -> Box> { + if wakachi { + Box::new(output::Wakachi::default()) + } else { + Box::new(output::Simple::new(print_all)) + } +} + +fn setup_analyzer<'a>(args: &Cli, dict: &'a impl DictionaryAccess) -> Box { + match args.split_sentences { + SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(dict)), + SentenceSplitMode::Default => Box::new(AnalyzeSplitted::new( + setup_output(args.wakati, args.print_all), + dict, + args.mode, + args.enable_debug, + )), + SentenceSplitMode::None => Box::new(AnalyzeNonSplitted::new( + setup_output(args.wakati, args.print_all), + dict, + args.mode, + args.enable_debug, + )), + } } fn main() { @@ -156,7 +166,7 @@ fn main() { // output: stdout or file let inner_writer: Box = match &args.output_file { Some(output_path) => Box::new( - File::create(&output_path) + File::create(output_path) .unwrap_or_else(|_| panic!("Failed to open output file {:?}", &output_path)), ), None => Box::new(io::stdout()), @@ -174,15 +184,7 @@ fn main() { let dict = JapaneseDictionary::from_cfg(&config) .unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e)); - let mut analyzer: Box = match args.split_sentences { - SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(&dict)), - SentenceSplitMode::Default => with_output!(args, |o| { - AnalyzeSplitted::new(o, &dict, args.mode, args.enable_debug) - }), - SentenceSplitMode::None => with_output!(args, |o| { - AnalyzeNonSplitted::new(o, &dict, args.mode, args.enable_debug) - }), - }; + let mut analyzer: Box = setup_analyzer(&args, &dict); let mut data = String::with_capacity(4 * 1024); let is_stdout = args.output_file.is_none(); @@ -207,10 +209,10 @@ fn strip_eol(data: &str) -> &str { let mut bytes = data.as_bytes(); let mut len = bytes.len(); if len > 1 && 
bytes[len - 1] == b'\n' { - len = len - 1; + len -= 1; bytes = &bytes[..len]; if len > 1 && bytes[len - 1] == b'\r' { - len = len - 1; + len -= 1; bytes = &bytes[..len]; } } @@ -218,6 +220,7 @@ fn strip_eol(data: &str) -> &str { // Safety: str was correct and we only removed full characters unsafe { std::str::from_utf8_unchecked(bytes) } } + #[cfg(test)] mod tests { use clap::CommandFactory; diff --git a/sudachi-cli/src/output.rs b/sudachi-cli/src/output.rs index d242ce53..94871dab 100644 --- a/sudachi-cli/src/output.rs +++ b/sudachi-cli/src/output.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ impl Wakachi { impl SudachiOutput for Wakachi { fn write(&self, writer: &mut Writer, morphemes: &MorphemeList) -> SudachiResult<()> { - if morphemes.len() == 0 { + if morphemes.is_empty() { writer.write_all(b"\n")?; return Ok(()); } diff --git a/sudachi/src/analysis/created.rs b/sudachi/src/analysis/created.rs index 87313bbe..2bce363f 100644 --- a/sudachi/src/analysis/created.rs +++ b/sudachi/src/analysis/created.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -40,7 +40,7 @@ impl CreatedWords { const MAX_SHIFT: Carrier = CreatedWords::MAX_VALUE - 1; pub fn empty() -> CreatedWords { - return Default::default(); + Default::default() } pub fn single<Pos: Into<Carrier>>(length: Pos) -> CreatedWords { @@ -55,7 +55,7 @@ impl CreatedWords { #[must_use] pub fn add_word<P: Into<Carrier>>(&self, length: P) -> CreatedWords { let mask = CreatedWords::single(length); - return self.add(mask); + self.add(mask) } #[must_use] @@ -67,21 +67,19 @@ impl CreatedWords { let mask = CreatedWords::single(length); if (self.0 & mask.0) == 0 { HasWord::No + } else if length.into() >= CreatedWords::MAX_VALUE as _ { + HasWord::Maybe } else { - if length.into() >= CreatedWords::MAX_VALUE as _ { - HasWord::Maybe - } else { - HasWord::Yes - } + HasWord::Yes } } pub fn is_empty(&self) -> bool { - return self.0 == 0; + self.0 == 0 } pub fn not_empty(&self) -> bool { - return !self.is_empty(); + !self.is_empty() } } diff --git a/sudachi/src/analysis/lattice.rs b/sudachi/src/analysis/lattice.rs index 7d1c4a97..a8eb93da 100644 --- a/sudachi/src/analysis/lattice.rs +++ b/sudachi/src/analysis/lattice.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -70,6 +70,7 @@ impl VNode { /// the size of vectors never shrink. /// You must use the size parameter to check the current size and never /// access vectors after the end.
+#[derive(Default)] pub struct Lattice { ends: Vec>, ends_full: Vec>, @@ -78,18 +79,6 @@ pub struct Lattice { size: usize, } -impl Default for Lattice { - fn default() -> Self { - Lattice { - ends: Vec::new(), - ends_full: Vec::new(), - indices: Vec::new(), - eos: None, - size: 0, - } - } -} - impl Lattice { fn reset_vec(data: &mut Vec>, target: usize) { for v in data.iter_mut() { @@ -242,8 +231,7 @@ impl Lattice { for boundary in (0..self.indices.len()).rev() { let nodes = &self.ends_full[boundary]; - for node_idx in 0..nodes.len() { - let r_node = &nodes[node_idx]; + for r_node in nodes { let (surface, pos) = if r_node.is_special_node() { ("(null)", PosData::Bos) } else if r_node.is_oov() { @@ -282,7 +270,7 @@ impl Lattice { write!(out, " {}", connect_cost)?; } - write!(out, "\n")?; + writeln!(out)?; dump_idx += 1; } diff --git a/sudachi/src/analysis/mlist.rs b/sudachi/src/analysis/mlist.rs index 2d1a76f0..2e80cb56 100644 --- a/sudachi/src/analysis/mlist.rs +++ b/sudachi/src/analysis/mlist.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -93,11 +93,7 @@ impl MorphemeList { match self.input.try_borrow_mut() { Ok(mut i) => { let mref = i.deref_mut(); - analyzer.swap_result( - &mut mref.input, - &mut self.nodes.mut_data(), - &mut mref.subset, - ); + analyzer.swap_result(&mut mref.input, self.nodes.mut_data(), &mut mref.subset); Ok(()) } Err(_) => Err(SudachiError::MorphemeListBorrowed), diff --git a/sudachi/src/analysis/morpheme.rs b/sudachi/src/analysis/morpheme.rs index c412e66b..39929c63 100644 --- a/sudachi/src/analysis/morpheme.rs +++ b/sudachi/src/analysis/morpheme.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,21 +91,21 @@ impl<'a, T: DictionaryAccess> Morpheme<'a, T> { /// /// "Dictionary form" means a word's lemma and "終止形" in Japanese. pub fn dictionary_form(&self) -> &str { - &self.get_word_info().dictionary_form() + self.get_word_info().dictionary_form() } /// Returns the normalized form of morpheme /// /// This method returns the form normalizing inconsistent spellings and inflected forms pub fn normalized_form(&self) -> &str { - &self.get_word_info().normalized_form() + self.get_word_info().normalized_form() } /// Returns the reading form of morpheme. /// /// Returns Japanese syllabaries 'フリガナ' in katakana. pub fn reading_form(&self) -> &str { - &self.get_word_info().reading_form() + self.get_word_info().reading_form() } /// Returns if this morpheme is out of vocabulary @@ -131,7 +131,7 @@ impl<'a, T: DictionaryAccess> Morpheme<'a, T> { } pub fn synonym_group_ids(&self) -> &[u32] { - &self.get_word_info().synonym_group_ids() + self.get_word_info().synonym_group_ids() } pub fn get_word_info(&self) -> &WordInfo { diff --git a/sudachi/src/analysis/node.rs b/sudachi/src/analysis/node.rs index 2e840080..fa2537d3 100644 --- a/sudachi/src/analysis/node.rs +++ b/sudachi/src/analysis/node.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -186,8 +186,8 @@ impl ResultNode { text: &'a InputBuffer, ) -> NodeSplitIterator<'a> { let splits: &[WordId] = match mode { - Mode::A => &self.word_info.a_unit_split(), - Mode::B => &self.word_info.b_unit_split(), + Mode::A => self.word_info.a_unit_split(), + Mode::B => self.word_info.b_unit_split(), Mode::C => panic!("splitting Node with Mode::C is not supported"), }; diff --git a/sudachi/src/analysis/stateful_tokenizer.rs b/sudachi/src/analysis/stateful_tokenizer.rs index fa69402e..64155cc4 100644 --- a/sudachi/src/analysis/stateful_tokenizer.rs +++ b/sudachi/src/analysis/stateful_tokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -88,7 +88,7 @@ impl StatefulTokenizer { /// Return current analysis mode pub fn mode(&self) -> Mode { - return self.mode; + self.mode } /// Analyzer will read only following [`WordInfo`] field subset @@ -105,7 +105,9 @@ impl StatefulTokenizer { /// Prepare StatefulTokenizer for the next data. /// Data must be written in the returned reference. pub fn reset(&mut self) -> &mut String { - self.top_path.as_mut().map(|p| p.clear()); + if let Some(p) = self.top_path.as_mut() { + p.clear() + }; self.oov.clear(); self.input.reset() } @@ -169,7 +171,7 @@ impl StatefulTokenizer { /// Resolve the path (as ResultNodes) with the smallest cost fn resolve_best_path(&mut self) -> SudachiResult> { let lex = self.dictionary.lexicon(); - let mut path = std::mem::replace(&mut self.top_path, None).unwrap_or_else(|| Vec::new()); + let mut path = self.top_path.take().unwrap_or_default(); self.lattice.fill_top_path(&mut self.top_path_ids); self.top_path_ids.reverse(); for pid in self.top_path_ids.drain(..) 
{ diff --git a/sudachi/src/analysis/stateless_tokenizer.rs b/sudachi/src/analysis/stateless_tokenizer.rs index 5670d217..eabee275 100644 --- a/sudachi/src/analysis/stateless_tokenizer.rs +++ b/sudachi/src/analysis/stateless_tokenizer.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,8 +132,8 @@ pub(super) fn split_path( Ok(new_path) } -pub(super) fn dump_path(path: &Vec) { - for (i, node) in (&path).iter().enumerate() { +pub(super) fn dump_path(path: &[ResultNode]) { + for (i, node) in path.iter().enumerate() { println!("{}: {}", i, node); } } diff --git a/sudachi/src/config.rs b/sudachi/src/config.rs index e5e50a14..0a1f0344 100644 --- a/sudachi/src/config.rs +++ b/sudachi/src/config.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -60,9 +60,9 @@ struct PathResolver { impl PathResolver { fn with_capacity(capacity: usize) -> PathResolver { - return PathResolver { + PathResolver { roots: Vec::with_capacity(capacity), - }; + } } fn add>(&mut self, path: P) { @@ -71,7 +71,7 @@ impl PathResolver { fn contains>(&self, path: P) -> bool { let query = path.as_ref(); - return self.roots.iter().find(|p| p.as_path() == query).is_some(); + return self.roots.iter().any(|p| p.as_path() == query); } pub fn first_existing + Clone>(&self, path: P) -> Option { @@ -95,13 +95,14 @@ impl PathResolver { } pub fn roots(&self) -> &[PathBuf] { - return &self.roots; + &self.roots } } -#[derive(Deserialize, Clone, Copy, Debug, Eq, PartialEq)] +#[derive(Deserialize, Clone, Copy, Debug, Default, Eq, PartialEq)] #[serde(rename_all = "snake_case")] pub enum SurfaceProjection { + #[default] Surface, Normalized, Reading, @@ -111,12 +112,6 @@ pub enum SurfaceProjection { NormalizedNouns, } -impl Default for SurfaceProjection { - fn default() -> Self { - SurfaceProjection::Surface - } -} - impl SurfaceProjection { /// Return required InfoSubset for the current projection type pub fn required_subset(&self) -> InfoSubset { @@ -292,14 +287,14 @@ impl ConfigBuilder { Config { resolver, system_dict: self.systemDict, - user_dicts: self.userDict.unwrap_or_else(|| Vec::new()), + user_dicts: self.userDict.unwrap_or_default(), character_definition_file, - connection_cost_plugins: self.connectionCostPlugin.unwrap_or(Vec::new()), - input_text_plugins: self.inputTextPlugin.unwrap_or(Vec::new()), - oov_provider_plugins: self.oovProviderPlugin.unwrap_or(Vec::new()), - path_rewrite_plugins: self.pathRewritePlugin.unwrap_or(Vec::new()), - projection: self.projection.unwrap_or(SurfaceProjection::Surface), + connection_cost_plugins: self.connectionCostPlugin.unwrap_or_default(), + input_text_plugins: self.inputTextPlugin.unwrap_or_default(), + oov_provider_plugins: self.oovProviderPlugin.unwrap_or_default(), + path_rewrite_plugins: 
self.pathRewritePlugin.unwrap_or_default(), + projection: self.projection.unwrap_or_default(), } } @@ -416,7 +411,7 @@ impl Config { } // Report an error - return Err(self.resolver.resolution_failure(&file_path)); + Err(self.resolver.resolution_failure(&file_path)) } pub fn resolved_system_dict(&self) -> Result { diff --git a/sudachi/src/dic/build/conn.rs b/sudachi/src/dic/build/conn.rs index d73cd17e..338e4f59 100644 --- a/sudachi/src/dic/build/conn.rs +++ b/sudachi/src/dic/build/conn.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -138,7 +138,7 @@ impl ConnBuffer { } fn parse_header(&mut self) -> DicWriteResult<(i16, i16)> { - let mut items = SPLIT_REGEX.splitn(&self.line.trim(), 2); + let mut items = SPLIT_REGEX.splitn(self.line.trim(), 2); // TODO: fix get_next error message let left = it_next(&self.line, &mut items, "left_num", parse_i16)?; let right = it_next(&self.line, &mut items, "right_num", parse_i16)?; @@ -146,7 +146,7 @@ impl ConnBuffer { } fn parse_line(&mut self) -> DicWriteResult<()> { - let mut items = SPLIT_REGEX.splitn(&self.line.trim(), 3); + let mut items = SPLIT_REGEX.splitn(self.line.trim(), 3); let left = it_next(&self.line, &mut items, "left", parse_i16)?; let right = it_next(&self.line, &mut items, "right", parse_i16)?; let cost = it_next(&self.line, &mut items, "cost", parse_i16)?; @@ -164,12 +164,12 @@ impl ConnBuffer { } fn num_error(part: &'static str, value: i16) -> SudachiResult { - return Err(DicBuildError { + Err(DicBuildError { file: "".to_owned(), line: 0, cause: BuildFailure::InvalidConnSize(part, value), } - .into()); + .into()) } #[cfg(test)] diff --git a/sudachi/src/dic/build/error.rs b/sudachi/src/dic/build/error.rs index 169604f7..487adf98 100644 --- a/sudachi/src/dic/build/error.rs +++ 
b/sudachi/src/dic/build/error.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -95,20 +95,12 @@ pub enum BuildFailure { TrieBuildFailure, } +#[derive(Default)] pub(crate) struct DicCompilationCtx { name: String, line: usize, } -impl Default for DicCompilationCtx { - fn default() -> Self { - DicCompilationCtx { - name: Default::default(), - line: Default::default(), - } - } -} - impl DicCompilationCtx { pub fn memory() -> Self { DicCompilationCtx { diff --git a/sudachi/src/dic/build/lexicon.rs b/sudachi/src/dic/build/lexicon.rs index e0f15e07..6514c4e7 100644 --- a/sudachi/src/dic/build/lexicon.rs +++ b/sudachi/src/dic/build/lexicon.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -84,7 +84,7 @@ impl StrPosEntry { Self { data: owned } } - pub fn from_built_pos(data: &Vec) -> Self { + pub fn from_built_pos(data: &[String]) -> Self { let mut iter = data.iter().map(|x| x.as_str()); let p1 = Cow::Borrowed(iter.next().unwrap()); let p2 = Cow::Borrowed(iter.next().unwrap()); @@ -146,7 +146,7 @@ pub(crate) trait SplitUnitResolver { surface, pos, reading, - } => self.resolve_inline(&surface, *pos, reading.as_deref()), + } => self.resolve_inline(surface, *pos, reading.as_deref()), } } @@ -206,7 +206,7 @@ impl RawLexiconEntry { ) -> DicWriteResult { let mut size = 0; - size += u16w.write(w, &self.headword())?; // surface of WordInfo + size += u16w.write(w, self.headword())?; // surface of WordInfo size += u16w.write_len(w, self.surface.len())?; // surface for trie w.write_all(&self.pos.to_le_bytes())?; size += 2; @@ -297,17 +297,21 @@ impl LexiconReader { .trim(Trim::None) .flexible(true) .from_reader(data); - let mut record = StringRecord::new(); let mut nread = 0; - while reader.read_record(&mut record).map_err(|e| { - let line = e.position().map_or(0, |p| p.line()); - self.ctx.set_line(line as usize); - self.ctx.to_sudachi_err(BuildFailure::CsvError(e)) - })? 
{ - let line = record.position().map_or(0, |p| p.line()) as usize; - self.ctx.set_line(line); - self.read_record(&record)?; - nread += 1; + for record in reader.records() { + match record { + Ok(r) => { + let line = r.position().map_or(0, |p| p.line()) as usize; + self.ctx.set_line(line); + self.read_record(&r)?; + nread += 1; + } + Err(e) => { + let line = e.position().map_or(0, |p| p.line()) as usize; + self.ctx.set_line(line); + return Err(self.ctx.to_sudachi_err(BuildFailure::CsvError(e))); + } + } } Ok(nread) } @@ -344,12 +348,10 @@ impl LexiconReader { let pos = rec.ctx.transform(self.pos_of([p1, p2, p3, p4, p5, p6]))?; - if splitting == Mode::A { - if !split_a.is_empty() || !split_b.is_empty() { - return rec.ctx.err(BuildFailure::InvalidSplit( - "A-mode tokens can't have splits".to_owned(), - )); - } + if splitting == Mode::A && (!split_a.is_empty() || !split_b.is_empty()) { + return rec.ctx.err(BuildFailure::InvalidSplit( + "A-mode tokens can't have splits".to_owned(), + )); } self.unresolved += resolve_a + resolve_b; @@ -497,7 +499,7 @@ impl LexiconReader { if WORD_ID_LITERAL.is_match(data) { Ok(SplitUnit::Ref(parse_wordid(data)?)) } else { - let mut iter = data.splitn(8, ","); + let mut iter = data.splitn(8, ','); let surface = it_next(data, &mut iter, "(1) surface", unescape)?; let p1 = it_next(data, &mut iter, "(2) pos-1", unescape_cow)?; let p2 = it_next(data, &mut iter, "(3) pos-2", unescape_cow)?; @@ -541,8 +543,7 @@ impl LexiconReader { resolver: &R, ) -> Result { let mut total = 0; - let mut line: usize = 0; - for e in self.entries.iter_mut() { + for (line, e) in self.entries.iter_mut().enumerate() { for s in e.splits_a.iter_mut() { match Self::resolve_split(s, resolver) { Some(val) => total += val, @@ -569,7 +570,6 @@ impl LexiconReader { } } } - line += 1; } Ok(total) } diff --git a/sudachi/src/dic/build/lexicon/test.rs b/sudachi/src/dic/build/lexicon/test.rs index 3a62c084..6039fc00 100644 --- a/sudachi/src/dic/build/lexicon/test.rs +++ 
b/sudachi/src/dic/build/lexicon/test.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ fn parse_kyoto_ignored() { let entries = rdr.entries(); assert_eq!(entries.len(), 1); let kyoto = &entries[0]; - assert_eq!(false, kyoto.should_index()); + assert!(!kyoto.should_index()); } #[test] @@ -155,9 +155,9 @@ fn parse_pos_exhausted() { let mut rdr = LexiconReader::new(); let mut data = String::new(); for i in 0..=MAX_POS_IDS + 1 { - write!( + writeln!( data, - "x,-1,-1,5293,京都,名詞,固有名詞,地名,一般,*,{},キョウト,京都,*,A,*,*,*,*\n", + "x,-1,-1,5293,京都,名詞,固有名詞,地名,一般,*,{},キョウト,京都,*,A,*,*,*,*", i ) .unwrap() diff --git a/sudachi/src/dic/build/mod.rs b/sudachi/src/dic/build/mod.rs index 2cd1c49a..dbbdda1f 100644 --- a/sudachi/src/dic/build/mod.rs +++ b/sudachi/src/dic/build/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -109,15 +109,15 @@ impl DictionaryAccess for NoDic { } fn input_text_plugins(&self) -> &[Box] { - return &[]; + &[] } fn oov_provider_plugins(&self) -> &[Box] { - return &[]; + &[] } fn path_rewrite_plugins(&self) -> &[Box] { - return &[]; + &[] } } @@ -305,7 +305,7 @@ impl DictBuilder { } /// this function must only be used in resolve_impl - fn unsafe_make_resolver<'a, 'b>(&'a self) -> RawDictResolver<'b> { + fn unsafe_make_resolver<'a>(&self) -> RawDictResolver<'a> { let resolver = RawDictResolver::new(self.lexicon.entries(), self.user); // resolver borrows parts of entries, but it does not touch splits // resolve function only modifies splits diff --git a/sudachi/src/dic/build/parse.rs b/sudachi/src/dic/build/parse.rs index ecf4d32e..c3cb4ae1 100644 --- a/sudachi/src/dic/build/parse.rs +++ b/sudachi/src/dic/build/parse.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -95,8 +95,8 @@ pub(crate) fn parse_dic_form(data: &str) -> DicWriteResult { #[inline] pub(crate) fn parse_wordid(data: &str) -> DicWriteResult { - if data.starts_with("U") { - let wid = parse_wordid_raw(&data[1..]); + if let Some(rest) = data.strip_prefix('U') { + let wid = parse_wordid_raw(rest); wid.map(|w| WordId::new(1, w.word())) } else { parse_wordid_raw(data) @@ -143,7 +143,7 @@ where { let mut result = Vec::with_capacity(4); - for part in data.split("/") { + for part in data.split('/') { result.push(f(part)?); } @@ -179,7 +179,7 @@ pub(crate) fn unescape_cow(data: &str) -> DicWriteResult> { if !UNICODE_LITERAL.is_match(data) { Ok(Cow::Borrowed(data)) } else { - unescape_slow(data).map(|s| Cow::Owned(s)) + unescape_slow(data).map(Cow::Owned) } } diff --git a/sudachi/src/dic/build/primitives.rs b/sudachi/src/dic/build/primitives.rs index d1b83c9e..2ee91e69 100644 --- a/sudachi/src/dic/build/primitives.rs +++ b/sudachi/src/dic/build/primitives.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ impl Utf16Writer { w.write_all(&[length as u8])?; 1 } else { - let b0 = (length as u8) & 0xff; + let b0 = length as u8; let b1 = ((length >> 8) as u8) | 0x80; w.write_all(&[b1, b0])?; 2 diff --git a/sudachi/src/dic/build/resolve.rs b/sudachi/src/dic/build/resolve.rs index 3160bbab..19d4e9be 100644 --- a/sudachi/src/dic/build/resolve.rs +++ b/sudachi/src/dic/build/resolve.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -23,18 +23,20 @@ use crate::error::SudachiResult; use crate::util::fxhash::FxBuildHasher; use std::collections::HashMap; +// HashMap from surface to (pos_id, reading_form, word-id)s +type ResolutionCandidateMap = HashMap, WordId)>, FxBuildHasher>; + /// We can't use trie to resolve splits because it is possible that refs are not in trie /// This resolver has to be owning because the dictionary content is lazily loaded and transient pub struct BinDictResolver { - index: HashMap, WordId)>, FxBuildHasher>, + index: ResolutionCandidateMap, } impl BinDictResolver { pub fn new(dict: D) -> SudachiResult { let lex = dict.lexicon(); let size = lex.size(); - let mut index: HashMap, WordId)>, FxBuildHasher> = - HashMap::default(); + let mut index: ResolutionCandidateMap = HashMap::default(); for id in 0..size { let wid = WordId::new(0, id); let winfo: WordInfoData = lex @@ -77,13 +79,12 @@ impl SplitUnitResolver for BinDictResolver { } pub struct RawDictResolver<'a> { - data: HashMap<&'a str, Vec<(u16, Option<&'a str>, WordId)>, FxBuildHasher>, + data: ResolutionCandidateMap<&'a str>, } impl<'a> RawDictResolver<'a> { pub(crate) fn new(entries: &'a [RawLexiconEntry], user: bool) -> Self { - let mut data: HashMap<&'a str, Vec<(u16, Option<&'a str>, WordId)>, FxBuildHasher> = - HashMap::default(); + let mut data: ResolutionCandidateMap<&'a str> = HashMap::default(); let dic_id = if user { 1 } else { 0 }; diff --git a/sudachi/src/dic/build/test/mod.rs b/sudachi/src/dic/build/test/mod.rs index 5710c42d..f2ee3e98 100644 --- a/sudachi/src/dic/build/test/mod.rs +++ b/sudachi/src/dic/build/test/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -28,7 +28,7 @@ use crate::dic::DictionaryLoader; use crate::error::SudachiError; use std::io::sink; -static MATRIX_10_10: &'static [u8] = include_bytes!("matrix_10x10.def"); +static MATRIX_10_10: &[u8] = include_bytes!("matrix_10x10.def"); #[test] fn build_grammar() { diff --git a/sudachi/src/dic/build/test/with_analysis.rs b/sudachi/src/dic/build/test/with_analysis.rs index 18f94a4d..12898bd3 100644 --- a/sudachi/src/dic/build/test/with_analysis.rs +++ b/sudachi/src/dic/build/test/with_analysis.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -78,14 +78,9 @@ impl ConfigTestSupport { let mut result = String::new(); - loop { - match data.next() { - Some(x) => { - write!(result, "{:?}{}", prev, sep).unwrap(); - prev = x; - } - None => break, - } + for x in data { + write!(result, "{:?}{}", prev, sep).unwrap(); + prev = x; } write!(result, "{:?}", prev).unwrap(); result diff --git a/sudachi/src/dic/character_category.rs b/sudachi/src/dic/character_category.rs index e13fb0cd..a31ef7da 100644 --- a/sudachi/src/dic/character_category.rs +++ b/sudachi/src/dic/character_category.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,10 +144,7 @@ impl CharacterCategory { } let mut categories = CategoryType::empty(); - for elem in cols[1..] 
- .iter() - .take_while(|elem| elem.chars().next().unwrap() != '#') - { + for elem in cols[1..].iter().take_while(|elem| !elem.starts_with('#')) { categories.insert(match elem.parse() { Ok(t) => t, Err(_) => { @@ -287,7 +284,7 @@ impl Iterator for CharCategoryIter<'_> { (left..char::MAX, *self.categories.categories.last().unwrap()) } else if self.current == 0 { let right = char::from_u32(*self.categories.boundaries.first().unwrap()).unwrap(); - let r = (0 as char)..right as char; + let r = (0 as char)..right; (r, self.categories.categories[0]) } else { let left = char::from_u32(self.categories.boundaries[self.current - 1]).unwrap(); diff --git a/sudachi/src/dic/dictionary.rs b/sudachi/src/dic/dictionary.rs index a7b95d87..9cef2564 100644 --- a/sudachi/src/dic/dictionary.rs +++ b/sudachi/src/dic/dictionary.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,7 +87,6 @@ impl JapaneseDictionary { let plugins = { let grammar = &mut basic_dict.grammar; - let cfg = &*cfg; Plugins::load(cfg, grammar)? }; @@ -116,12 +115,12 @@ impl JapaneseDictionary { } /// Returns grammar with the correct lifetime - pub fn grammar<'a>(&'a self) -> &Grammar<'a> { + pub fn grammar(&self) -> &Grammar<'_> { &self._grammar } /// Returns lexicon with the correct lifetime - pub fn lexicon<'a>(&'a self) -> &LexiconSet<'a> { + pub fn lexicon(&self) -> &LexiconSet<'_> { &self._lexicon } diff --git a/sudachi/src/dic/grammar.rs b/sudachi/src/dic/grammar.rs index 1642a3bf..70da5ce3 100644 --- a/sudachi/src/dic/grammar.rs +++ b/sudachi/src/dic/grammar.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,13 +27,15 @@ use nom::{ }; use std::ops::Index; +type PosList = Vec>; + /// Dictionary grammar /// /// Contains part_of_speech list and connection cost map. /// It also holds character category. pub struct Grammar<'a> { _bytes: &'a [u8], - pub pos_list: Vec>, + pub pos_list: PosList, pub storage_size: usize, /// The mapping to overload cost table @@ -163,7 +165,7 @@ impl<'a> Grammar<'a> { } } -fn pos_list_parser(input: &[u8]) -> SudachiNomResult<&[u8], Vec>> { +fn pos_list_parser(input: &[u8]) -> SudachiNomResult<&[u8], PosList> { let (rest, pos_size) = le_u16(input)?; nom::multi::count( nom::multi::count(utf16_string_parser, POS_DEPTH), @@ -171,10 +173,7 @@ fn pos_list_parser(input: &[u8]) -> SudachiNomResult<&[u8], Vec>> { )(rest) } -fn grammar_parser( - input: &[u8], - offset: usize, -) -> SudachiNomResult<&[u8], (Vec>, i16, i16)> { +fn grammar_parser(input: &[u8], offset: usize) -> SudachiNomResult<&[u8], (PosList, i16, i16)> { nom::sequence::preceded( take(offset), nom::sequence::tuple((pos_list_parser, le_i16, le_i16)), @@ -259,14 +258,11 @@ mod tests { storage } fn string_to_bytes(s: &str) -> Vec { - s.encode_utf16() - .map(|c| c.to_le_bytes()) - .flatten() - .collect() + s.encode_utf16().flat_map(|c| c.to_le_bytes()).collect() } - fn build_partofspeech(storage: &mut Vec) -> () { + fn build_partofspeech(storage: &mut Vec) { // number of part of speech - storage.extend(&(3 as i16).to_le_bytes()); + storage.extend(&3_i16.to_le_bytes()); storage.extend( b"\x07B\x00O\x00S\x00/\x00E\x00O\x00S\x00\x01*\x00\x01*\x00\x01*\x00\x01*\x00\x01*\x00", @@ -287,20 +283,20 @@ mod tests { storage.extend(b"\x06"); storage.extend(string_to_bytes("終止形-一般")); } - fn build_connect_table(storage: &mut Vec) -> () { - storage.extend(&(3 as i16).to_le_bytes()); - storage.extend(&(3 as i16).to_le_bytes()); + fn build_connect_table(storage: &mut 
Vec) { + storage.extend(&3_i16.to_le_bytes()); + storage.extend(&3_i16.to_le_bytes()); - storage.extend(&(0 as i16).to_le_bytes()); - storage.extend(&(-300 as i16).to_le_bytes()); - storage.extend(&(300 as i16).to_le_bytes()); + storage.extend(&0_i16.to_le_bytes()); + storage.extend(&(-300_i16).to_le_bytes()); + storage.extend(&300_i16.to_le_bytes()); - storage.extend(&(300 as i16).to_le_bytes()); - storage.extend(&(-500 as i16).to_le_bytes()); - storage.extend(&(-100 as i16).to_le_bytes()); + storage.extend(&300_i16.to_le_bytes()); + storage.extend(&(-500_i16).to_le_bytes()); + storage.extend(&(-100_i16).to_le_bytes()); - storage.extend(&(-3000 as i16).to_le_bytes()); - storage.extend(&(200 as i16).to_le_bytes()); - storage.extend(&(2000 as i16).to_le_bytes()); + storage.extend(&(-3000_i16).to_le_bytes()); + storage.extend(&200_i16.to_le_bytes()); + storage.extend(&2000_i16.to_le_bytes()); } } diff --git a/sudachi/src/dic/header.rs b/sudachi/src/dic/header.rs index 3df2e0e5..306dd0e3 100644 --- a/sudachi/src/dic/header.rs +++ b/sudachi/src/dic/header.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -119,6 +119,12 @@ pub struct Header { pub description: String, } +impl Default for Header { + fn default() -> Self { + Self::new() + } +} + impl Header { const DESCRIPTION_SIZE: usize = 256; pub const STORAGE_SIZE: usize = 8 + 8 + Header::DESCRIPTION_SIZE; @@ -164,21 +170,21 @@ impl Header { /// Returns if this header version has grammar pub fn has_grammar(&self) -> bool { - match self.version { - HeaderVersion::SystemDict(_) => true, - HeaderVersion::UserDict(UserDictVersion::Version2) => true, - HeaderVersion::UserDict(UserDictVersion::Version3) => true, - _ => false, - } + matches!( + self.version, + HeaderVersion::SystemDict(_) + | HeaderVersion::UserDict(UserDictVersion::Version2) + | HeaderVersion::UserDict(UserDictVersion::Version3) + ) } /// Returns if this header version has synonym group ids pub fn has_synonym_group_ids(&self) -> bool { - match self.version { - HeaderVersion::SystemDict(SystemDictVersion::Version2) => true, - HeaderVersion::UserDict(UserDictVersion::Version3) => true, - _ => false, - } + matches!( + self.version, + HeaderVersion::SystemDict(SystemDictVersion::Version2) + | HeaderVersion::UserDict(UserDictVersion::Version3) + ) } pub fn write_to(&self, w: &mut W) -> SudachiResult { @@ -191,7 +197,7 @@ impl Header { w.write_all(&self.version.to_u64().to_le_bytes())?; w.write_all(&self.create_time.to_le_bytes())?; - w.write_all(&self.description.as_bytes())?; + w.write_all(self.description.as_bytes())?; for _ in 0..Header::DESCRIPTION_SIZE - self.description.len() { w.write_all(&[0])?; } @@ -204,7 +210,7 @@ fn nul_terminated_str_from_slice(buf: &[u8]) -> String { let str_bytes: &[u8] = if let Some(nul_idx) = buf.iter().position(|b| *b == 0) { &buf[..nul_idx] } else { - &buf + buf }; String::from_utf8_lossy(str_bytes).to_string() } diff --git a/sudachi/src/dic/lexicon/mod.rs b/sudachi/src/dic/lexicon/mod.rs index 20043e81..f6e9f599 100644 --- a/sudachi/src/dic/lexicon/mod.rs +++ b/sudachi/src/dic/lexicon/mod.rs @@ -1,5 +1,5 @@ /* - * 
Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,7 +110,7 @@ impl<'a> Lexicon<'a> { #[inline] fn word_id(&self, raw_id: u32) -> WordId { - return WordId::new(self.lex_id, raw_id); + WordId::new(self.lex_id, raw_id) } /// Returns an iterator of word_id and end of words that matches given input diff --git a/sudachi/src/dic/lexicon/trie.rs b/sudachi/src/dic/lexicon/trie.rs index d6e76c31..2dfc0d3e 100644 --- a/sudachi/src/dic/lexicon/trie.rs +++ b/sudachi/src/dic/lexicon/trie.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,7 +100,7 @@ impl<'a> Trie<'a> { } pub fn total_size(&self) -> usize { - 4 * self.array.len() as usize + 4 * self.array.len() } #[inline] diff --git a/sudachi/src/dic/lexicon/word_id_table.rs b/sudachi/src/dic/lexicon/word_id_table.rs index 48683497..bd79cfc5 100644 --- a/sudachi/src/dic/lexicon/word_id_table.rs +++ b/sudachi/src/dic/lexicon/word_id_table.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -39,10 +39,10 @@ impl<'a> WordIdTable<'a> { #[inline] pub fn entries(&self, index: usize) -> WordIdIter { debug_assert!(index < self.bytes.len()); - let ptr = unsafe { self.bytes.as_ptr().offset((index + self.offset) as isize) }; + let ptr = unsafe { self.bytes.as_ptr().add(index + self.offset) }; let cnt = unsafe { ptr.read() } as usize; let data_ptr = unsafe { ptr.offset(1) } as *const u32; - debug_assert!(index + cnt * std::mem::size_of::() + 1 <= self.bytes.len()); + debug_assert!(index + cnt * std::mem::size_of::() < self.bytes.len()); WordIdIter { data: unsafe { NonNull::new_unchecked(data_ptr as _) }, remaining: cnt, diff --git a/sudachi/src/dic/lexicon/word_infos.rs b/sudachi/src/dic/lexicon/word_infos.rs index 4be9cf85..8b6e22dd 100644 --- a/sudachi/src/dic/lexicon/word_infos.rs +++ b/sudachi/src/dic/lexicon/word_infos.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,6 @@ * limitations under the License. 
*/ -use std::iter::FusedIterator; - -use crate::dic::lexicon_set::LexiconSet; use crate::dic::read::u32_parser; use crate::dic::read::word_info::WordInfoParser; use crate::dic::subset::InfoSubset; @@ -177,30 +174,3 @@ impl From for WordInfoData { info.data } } - -struct SplitIter<'a> { - index: usize, - split: &'a [WordId], - lexicon: &'a LexiconSet<'a>, -} - -impl Iterator for SplitIter<'_> { - type Item = SudachiResult; - - fn next(&mut self) -> Option { - let idx = self.index; - if idx >= self.split.len() { - None - } else { - self.index += 1; - Some(self.lexicon.get_word_info(self.split[idx])) - } - } - - fn size_hint(&self) -> (usize, Option) { - let rem = self.split.len() - self.index; - (rem, Some(rem)) - } -} - -impl FusedIterator for SplitIter<'_> {} diff --git a/sudachi/src/dic/lexicon_set.rs b/sudachi/src/dic/lexicon_set.rs index 32ffde1d..1f41632b 100644 --- a/sudachi/src/dic/lexicon_set.rs +++ b/sudachi/src/dic/lexicon_set.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -116,8 +116,8 @@ impl LexiconSet<'_> { let pos_id = word_info.pos_id as usize; if dict_id > 0 && pos_id >= self.num_system_pos { // user defined part-of-speech - word_info.pos_id = (pos_id as usize - self.num_system_pos - + self.pos_offsets[dict_id as usize]) as u16; + word_info.pos_id = + (pos_id - self.num_system_pos + self.pos_offsets[dict_id as usize]) as u16; } } @@ -142,7 +142,7 @@ impl LexiconSet<'_> { self.lexicons[dic_id].get_word_param(id.word()) } - fn update_dict_id(split: &mut Vec, dict_id: u8) -> SudachiResult<()> { + fn update_dict_id(split: &mut [WordId], dict_id: u8) -> SudachiResult<()> { for id in split.iter_mut() { let cur_dict_id = id.dic(); if cur_dict_id > 0 { diff --git a/sudachi/src/dic/mod.rs b/sudachi/src/dic/mod.rs index 65430939..04ac359f 100644 --- a/sudachi/src/dic/mod.rs +++ b/sudachi/src/dic/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,9 @@ impl<'a> LoadedDictionary<'a> { let lexicon = other.lexicon; let grammar = other.grammar; self.lexicon_set.append(lexicon, npos)?; - grammar.map(|g| self.grammar.merge(g)); + if let Some(g) = grammar { + self.grammar.merge(g) + } Ok(self) } } @@ -117,6 +119,7 @@ pub struct DictionaryLoader<'a> { impl<'a> DictionaryLoader<'a> { /// Creates a binary dictionary from bytes /// + /// # Safety /// This function is marked unsafe because it does not perform header validation pub unsafe fn read_any_dictionary(dictionary_bytes: &[u8]) -> SudachiResult { let header = Header::parse(&dictionary_bytes[..Header::STORAGE_SIZE])?; diff --git a/sudachi/src/dic/read/mod.rs b/sudachi/src/dic/read/mod.rs index d218e8d3..494855e2 100644 --- a/sudachi/src/dic/read/mod.rs +++ b/sudachi/src/dic/read/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. 
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,7 +30,7 @@ pub fn u32_array_parser(input: &[u8]) -> SudachiNomResult<&[u8], Vec> { pub fn u32_wid_array_parser(input: &[u8]) -> SudachiNomResult<&[u8], Vec> { let (rest, length) = le_u8(input)?; - nom::multi::count(le_u32.map(|id| WordId::from_raw(id)), length as usize)(rest) + nom::multi::count(le_u32.map(WordId::from_raw), length as usize)(rest) } pub fn skip_wid_array(input: &[u8]) -> SudachiNomResult<&[u8], Vec> { diff --git a/sudachi/src/dic/storage.rs b/sudachi/src/dic/storage.rs index b724ccbc..d11e4610 100644 --- a/sudachi/src/dic/storage.rs +++ b/sudachi/src/dic/storage.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,7 +28,7 @@ impl AsRef<[u8]> for Storage { match self { Storage::File(m) => m.as_bytes(), Storage::Borrowed(b) => b, - Storage::Owned(v) => &v, + Storage::Owned(v) => v, } } } @@ -56,6 +56,8 @@ impl SudachiDicData { self.system.as_ref() } + /// # Safety + /// Call this function only after system dictionary data is ready. pub unsafe fn system_static_slice(&self) -> &'static [u8] { std::mem::transmute(self.system()) } diff --git a/sudachi/src/dic/word_id.rs b/sudachi/src/dic/word_id.rs index 32240093..5de0e69a 100644 --- a/sudachi/src/dic/word_id.rs +++ b/sudachi/src/dic/word_id.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,7 +57,7 @@ impl WordId { let dic_part = ((dic & 0xf) as u32) << 28; let word_part = word & WORD_MASK; let raw = dic_part | word_part; - return Self::from_raw(raw); + Self::from_raw(raw) } /// Creates the WordId with correctness checking @@ -84,12 +84,12 @@ impl WordId { /// Extract Dictionary ID pub fn dic(&self) -> u8 { - return (self.raw >> 28) as u8; + (self.raw >> 28) as u8 } /// Extract Word ID pub fn word(&self) -> u32 { - return self.raw & WORD_MASK; + self.raw & WORD_MASK } /// Check if the word comes from the system dictionary @@ -99,10 +99,7 @@ impl WordId { /// Check if the word comes from the user dictionary pub fn is_user(&self) -> bool { - match self.dic() { - 0 | 0xf => false, - _ => true, - } + !matches!(self.dic(), 0 | 0xf) } pub fn as_raw(&self) -> u32 { diff --git a/sudachi/src/error.rs b/sudachi/src/error.rs index 4e43b64a..34a10498 100644 --- a/sudachi/src/error.rs +++ b/sudachi/src/error.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -15,7 +15,6 @@ */ use std::fmt::Debug; -use std::io::Error; use thiserror::Error; use crate::config::ConfigError; @@ -50,7 +49,7 @@ pub enum SudachiError { FromUtf16(#[from] std::string::FromUtf16Error), #[error("Regex error")] - RegexError(#[from] fancy_regex::Error), + RegexError { cause: Box }, #[error("Error from nom {0}")] NomParseError(String), @@ -108,7 +107,7 @@ pub enum SudachiError { } impl From for SudachiError { - fn from(e: Error) -> Self { + fn from(e: std::io::Error) -> Self { SudachiError::Io { cause: e, context: String::from("IO Error"), @@ -116,6 +115,12 @@ impl From for SudachiError { } } +impl From for SudachiError { + fn from(e: fancy_regex::Error) -> Self { + SudachiError::RegexError { cause: Box::new(e) } + } +} + impl SudachiError { pub fn with_context>(self, ctx: S) -> Self { match self { diff --git a/sudachi/src/hash.rs b/sudachi/src/hash.rs index 041f3460..96987b19 100644 --- a/sudachi/src/hash.rs +++ b/sudachi/src/hash.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -91,7 +91,7 @@ impl Hasher for RoMuHash { #[inline(always)] fn write_u64(&mut self, i: u64) { - self.consume(i as u64); + self.consume(i); } #[inline(always)] diff --git a/sudachi/src/input_text/buffer/edit.rs b/sudachi/src/input_text/buffer/edit.rs index 300f73ab..41ac5232 100644 --- a/sudachi/src/input_text/buffer/edit.rs +++ b/sudachi/src/input_text/buffer/edit.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,7 +89,7 @@ impl<'a> InputEditor<'a> { // Current plugin implementations satisfy this criteria. 
pub fn resolve_edits( source: &str, - source_mapping: &Vec, + source_mapping: &[usize], target: &mut String, target_mapping: &mut Vec, edits: &mut Vec, @@ -127,7 +127,7 @@ pub fn resolve_edits( } fn add_replace( - source_mapping: &Vec, + source_mapping: &[usize], target: &mut String, target_mapping: &mut Vec, what: Range, diff --git a/sudachi/src/input_text/buffer/mod.rs b/sudachi/src/input_text/buffer/mod.rs index db2996af..7c0206a9 100644 --- a/sudachi/src/input_text/buffer/mod.rs +++ b/sudachi/src/input_text/buffer/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,19 +34,14 @@ const MAX_LENGTH: usize = u16::MAX as usize / 4 * 3; /// if the limit of the rewritten sentence is more than this number, then all bets are off const REALLY_MAX_LENGTH: usize = u16::MAX as usize; -#[derive(Eq, PartialEq, Debug, Clone)] +#[derive(Eq, PartialEq, Debug, Default, Clone)] enum BufferState { + #[default] Clean, RW, RO, } -impl Default for BufferState { - fn default() -> Self { - BufferState::Clean - } -} - /// InputBuffer - prepares the input data for the analysis /// /// By saying char we actually mean Unicode codepoint here. @@ -112,7 +107,7 @@ impl InputBuffer { /// Creates input from the passed string. Should be used mostly for tests. /// /// Panics if the input string is too long. 
- pub fn from<'a, T: AsRef>(data: T) -> InputBuffer { + pub fn from>(data: T) -> InputBuffer { let mut buf = Self::new(); buf.reset().push_str(data.as_ref()); buf.start_build().expect(""); diff --git a/sudachi/src/plugin/connect_cost/inhibit_connection.rs b/sudachi/src/plugin/connect_cost/inhibit_connection.rs index c664280d..c25aba41 100644 --- a/sudachi/src/plugin/connect_cost/inhibit_connection.rs +++ b/sudachi/src/plugin/connect_cost/inhibit_connection.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,8 +83,9 @@ mod tests { let right = 0; let bytes = build_mock_bytes(); let mut grammar = build_mock_grammar(&bytes); - let mut plugin = InhibitConnectionPlugin::default(); - plugin.inhibit_pairs = vec![(left, right)]; + let plugin = InhibitConnectionPlugin { + inhibit_pairs: vec![(left, right)], + }; plugin.edit(&mut grammar); assert_eq!( @@ -96,10 +97,10 @@ mod tests { fn build_mock_bytes() -> Vec { let mut buf = Vec::new(); // 0 - pos size, 1x1 connection with 0 element - buf.extend(&(0 as i16).to_le_bytes()); - buf.extend(&(1 as i16).to_le_bytes()); - buf.extend(&(1 as i16).to_le_bytes()); - buf.extend(&(0 as i16).to_le_bytes()); + buf.extend(&0_i16.to_le_bytes()); + buf.extend(&1_i16.to_le_bytes()); + buf.extend(&1_i16.to_le_bytes()); + buf.extend(&0_i16.to_le_bytes()); buf } diff --git a/sudachi/src/plugin/connect_cost/mod.rs b/sudachi/src/plugin/connect_cost/mod.rs index ad012354..eefbe335 100644 --- a/sudachi/src/plugin/connect_cost/mod.rs +++ b/sudachi/src/plugin/connect_cost/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ impl PluginCategory for dyn EditConnectionCostPlugin { fn bundled_impl(name: &str) -> Option { match name { - "InhibitConnectionPlugin" => Some(Box::new(InhibitConnectionPlugin::default())), + "InhibitConnectionPlugin" => Some(Box::::default()), _ => None, } } diff --git a/sudachi/src/plugin/input_text/default_input_text/mod.rs b/sudachi/src/plugin/input_text/default_input_text/mod.rs index 3eba4bb7..069f7ff6 100644 --- a/sudachi/src/plugin/input_text/default_input_text/mod.rs +++ b/sudachi/src/plugin/input_text/default_input_text/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -77,7 +77,7 @@ impl DefaultInputTextPlugin { for (i, line) in reader.lines().enumerate() { let line = line?; let line = line.trim(); - if line.is_empty() || line.chars().next().unwrap() == '#' { + if line.is_empty() || line.starts_with('#') { continue; } let cols: Vec<_> = line.split_whitespace().collect(); @@ -103,7 +103,7 @@ impl DefaultInputTextPlugin { } let first_char = cols[0].chars().next().unwrap(); let n_char = cols[0].chars().count(); - if key_lengths.get(&first_char).map(|v| *v).unwrap_or(0) < n_char { + if key_lengths.get(&first_char).copied().unwrap_or(0) < n_char { key_lengths.insert(first_char, n_char); } replace_char_map.insert(cols[0].to_string(), cols[1].to_string()); @@ -200,10 +200,7 @@ impl DefaultInputTextPlugin { // 2. 
handle normalization let need_lowercase = ch.is_uppercase(); let need_nkfc = !self.should_ignore(ch) - && match is_nfkc_quick(std::iter::once(ch)) { - IsNormalized::Yes => false, - _ => true, - }; + && !matches!(is_nfkc_quick(std::iter::once(ch)), IsNormalized::Yes); // iterator types are incompatible, so calls can't be moved outside branches match (need_lowercase, need_nkfc) { @@ -237,14 +234,10 @@ impl DefaultInputTextPlugin { len: usize, ch: char, ) { - match data.next() { - Some(ch2) => { - if ch2 == ch { - return; - } + if let Some(ch2) = data.next() { + if ch2 != ch { replacer.replace_char_iter(start..start + len, ch2, data) } - None => return, } } } @@ -264,7 +257,7 @@ impl InputTextPlugin for DefaultInputTextPlugin { .unwrap_or_else(|| DEFAULT_REWRITE_DEF_FILE.into()), )?; - let reader = BufReader::new(fs::File::open(&rewrite_file_path)?); + let reader = BufReader::new(fs::File::open(rewrite_file_path)?); self.read_rewrite_lists(reader)?; Ok(()) @@ -280,11 +273,7 @@ impl InputTextPlugin for DefaultInputTextPlugin { edit: InputEditor<'a>, ) -> SudachiResult> { let chars = buffer.current_chars(); - let need_nkfc = match is_nfkc_quick(chars.iter().cloned()) { - IsNormalized::Yes => false, - _ => true, - }; - + let need_nkfc = !matches!(is_nfkc_quick(chars.iter().cloned()), IsNormalized::Yes); let need_lowercase = chars.iter().any(|c| c.is_uppercase()); if need_nkfc || need_lowercase { diff --git a/sudachi/src/plugin/input_text/mod.rs b/sudachi/src/plugin/input_text/mod.rs index b0aee380..67f00d91 100644 --- a/sudachi/src/plugin/input_text/mod.rs +++ b/sudachi/src/plugin/input_text/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -70,9 +70,9 @@ impl PluginCategory for dyn InputTextPlugin { fn bundled_impl(name: &str) -> Option { match name { - "IgnoreYomiganaPlugin" => Some(Box::new(IgnoreYomiganaPlugin::default())), - "DefaultInputTextPlugin" => Some(Box::new(DefaultInputTextPlugin::default())), - "ProlongedSoundMarkPlugin" => Some(Box::new(ProlongedSoundMarkPlugin::default())), + "IgnoreYomiganaPlugin" => Some(Box::::default()), + "DefaultInputTextPlugin" => Some(Box::::default()), + "ProlongedSoundMarkPlugin" => Some(Box::::default()), _ => None, } } diff --git a/sudachi/src/plugin/loader.rs b/sudachi/src/plugin/loader.rs index dfba8a1e..251b16fd 100644 --- a/sudachi/src/plugin/loader.rs +++ b/sudachi/src/plugin/loader.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -80,7 +80,7 @@ fn system_specific_name(s: &str) -> Option { let fname = p .file_name() .and_then(|np| np.to_str()) - .map(|f| make_system_specific_name(f)); + .map(make_system_specific_name); let parent = p.parent().and_then(|np| np.to_str()); match (parent, fname) { (Some(p), Some(c)) => Some(format!("{}/{}", p, c)), @@ -109,10 +109,10 @@ impl<'a, 'b, T: PluginCategory + ?Sized> PluginLoader<'a, 'b, T> { } pub fn freeze(self) -> PluginContainer { - return PluginContainer { + PluginContainer { libraries: self.libraries, plugins: self.plugins, - }; + } } fn load_plugin(&mut self, name: &str, plugin_cfg: &Value) -> SudachiResult<()> { @@ -132,7 +132,7 @@ impl<'a, 'b, T: PluginCategory + ?Sized> PluginLoader<'a, 'b, T> { self.load_plugin_from_dso(&candidates)? 
}; - ::do_setup(&mut plugin, plugin_cfg, &self.cfg, &mut self.grammar) + ::do_setup(&mut plugin, plugin_cfg, self.cfg, self.grammar) .map_err(|e| e.with_context(format!("plugin {} setup", name)))?; self.plugins.push(plugin); Ok(()) @@ -236,9 +236,9 @@ pub trait PluginCategory { /// Helper function to load the plugins of a single category /// Should be called with turbofish syntax and trait object type: /// `let plugins = load_plugins_of::(...)`. -pub fn load_plugins_of<'a, 'b, T: PluginCategory + ?Sized>( +pub fn load_plugins_of<'a, T: PluginCategory + ?Sized>( cfg: &'a Config, - grammar: &'a mut Grammar<'b>, + grammar: &'a mut Grammar<'_>, ) -> SudachiResult> { let mut loader: PluginLoader = PluginLoader::new(grammar, cfg); loader.load()?; diff --git a/sudachi/src/plugin/oov/mecab_oov/mod.rs b/sudachi/src/plugin/oov/mecab_oov/mod.rs index db0b6682..f083e804 100644 --- a/sudachi/src/plugin/oov/mecab_oov/mod.rs +++ b/sudachi/src/plugin/oov/mecab_oov/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ const DEFAULT_UNK_DEF_FILE: &str = "unk.def"; #[derive(Default)] pub struct MeCabOovPlugin { categories: HashMap, - oov_list: HashMap, RoMu>, + oov_list: HashMap, RoMu>, } /// Struct corresponds with raw config json file. 
@@ -70,7 +70,7 @@ impl MeCabOovPlugin { let line = line?; let line = line.trim(); if line.is_empty() - || line.chars().next().unwrap() == '#' + || line.starts_with('#') || line.chars().take(2).collect::>() == vec!['0', 'x'] { continue; @@ -118,18 +118,21 @@ impl MeCabOovPlugin { categories: &HashMap, mut grammar: &mut Grammar, user_pos: UserPosMode, - ) -> SudachiResult, RoMu>> { - let mut oov_list: HashMap, RoMu> = HashMap::with_hasher(RoMu::new()); + ) -> SudachiResult, RoMu>> { + let mut oov_list: HashMap, RoMu> = HashMap::with_hasher(RoMu::new()); for (i, line) in reader.lines().enumerate() { let line = line?; let line = line.trim(); - if line.is_empty() || line.chars().next().unwrap() == '#' { + if line.is_empty() || line.starts_with('#') { continue; } let cols: Vec<_> = line.split(',').collect(); - if cols.len() < 10 { - return Err(SudachiError::InvalidDataFormat(i, format!("{}", line))); + if cols.len() != 10 { + return Err(SudachiError::InvalidDataFormat( + i, + format!("Invalid number of columns ({})", line), + )); } let category_type: CategoryType = cols[0].parse()?; if !categories.contains_key(&category_type) { @@ -139,7 +142,7 @@ impl MeCabOovPlugin { )); } - let oov = OOV { + let oov = Oov { left_id: cols[1].parse()?, right_id: cols[2].parse()?, cost: cols[3].parse()?, @@ -182,7 +185,7 @@ impl MeCabOovPlugin { } /// Creates a new oov node - fn get_oov_node(&self, oov: &OOV, start: usize, end: usize) -> Node { + fn get_oov_node(&self, oov: &Oov, start: usize, end: usize) -> Node { Node::new( start as u16, end as u16, @@ -258,7 +261,7 @@ impl OovProviderPlugin for MeCabOovPlugin { .charDef .unwrap_or_else(|| PathBuf::from(DEFAULT_CHAR_DEF_FILE)), )?; - let reader = BufReader::new(fs::File::open(&char_def_path)?); + let reader = BufReader::new(fs::File::open(char_def_path)?); let categories = MeCabOovPlugin::read_character_property(reader)?; let unk_def_path = config.complete_path( @@ -266,7 +269,7 @@ impl OovProviderPlugin for MeCabOovPlugin { .unkDef 
.unwrap_or_else(|| PathBuf::from(DEFAULT_UNK_DEF_FILE)), )?; - let reader = BufReader::new(fs::File::open(&unk_def_path)?); + let reader = BufReader::new(fs::File::open(unk_def_path)?); let oov_list = MeCabOovPlugin::read_oov(reader, &categories, grammar, settings.userPOS)?; self.categories = categories; @@ -297,7 +300,7 @@ struct CategoryInfo { /// The OOV definition #[derive(Debug, Default, Clone)] -struct OOV { +struct Oov { left_id: i16, right_id: i16, cost: i16, diff --git a/sudachi/src/plugin/oov/mecab_oov/test.rs b/sudachi/src/plugin/oov/mecab_oov/test.rs index f46dc443..f5ca6550 100644 --- a/sudachi/src/plugin/oov/mecab_oov/test.rs +++ b/sudachi/src/plugin/oov/mecab_oov/test.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -437,7 +437,7 @@ fn read_oov_with_too_few_columns() { &mut grammar, UserPosMode::Forbid, ); - assert_matches!(result, Err(SudachiError::InvalidDataFormat(0, s)) if s == data); + assert_matches!(result, Err(SudachiError::InvalidDataFormat(0, s)) if s.contains(data)); } #[test] @@ -492,13 +492,13 @@ fn read_oov_with_category_not_in_character_property() { fn build_plugin() -> MeCabOovPlugin { let mut plugin = MeCabOovPlugin::default(); - let oov1 = OOV { + let oov1 = Oov { right_id: -1, left_id: -1, cost: -1, pos_id: 1, }; - let oov2 = OOV { + let oov2 = Oov { right_id: -1, left_id: -1, cost: -1, diff --git a/sudachi/src/plugin/oov/mod.rs b/sudachi/src/plugin/oov/mod.rs index 847f7c23..986fe8a0 100644 --- a/sudachi/src/plugin/oov/mod.rs +++ b/sudachi/src/plugin/oov/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,9 +61,9 @@ impl PluginCategory for dyn OovProviderPlugin { fn bundled_impl(name: &str) -> Option { match name { - "SimpleOovPlugin" => Some(Box::new(SimpleOovPlugin::default())), - "MeCabOovPlugin" => Some(Box::new(MeCabOovPlugin::default())), - "RegexOovProvider" => Some(Box::new(RegexOovProvider::default())), + "SimpleOovPlugin" => Some(Box::::default()), + "MeCabOovPlugin" => Some(Box::::default()), + "RegexOovProvider" => Some(Box::::default()), _ => None, } } diff --git a/sudachi/src/plugin/oov/regex_oov/mod.rs b/sudachi/src/plugin/oov/regex_oov/mod.rs index ec0b68f4..c9628d3b 100644 --- a/sudachi/src/plugin/oov/regex_oov/mod.rs +++ b/sudachi/src/plugin/oov/regex_oov/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Works Applications Co., Ltd. + * Copyright (c) 2022-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,19 +44,14 @@ pub(crate) struct RegexOovProvider { boundaries: BoundaryMode, } -#[derive(Deserialize, Eq, PartialEq, Debug, Copy, Clone)] +#[derive(Deserialize, Eq, PartialEq, Debug, Default, Copy, Clone)] #[serde(rename_all = "lowercase")] pub enum BoundaryMode { + #[default] Strict, Relaxed, } -impl Default for BoundaryMode { - fn default() -> Self { - BoundaryMode::Strict - } -} - fn default_max_length() -> usize { 32 } @@ -89,7 +84,7 @@ impl OovProviderPlugin for RegexOovProvider { ) -> SudachiResult<()> { let mut parsed: RegexProviderConfig = serde_json::from_value(settings.clone())?; - if !parsed.regex.starts_with("^") { + if !parsed.regex.starts_with('^') { parsed.regex.insert(0, '^'); } diff --git a/sudachi/src/plugin/path_rewrite/join_katakana_oov/tests.rs b/sudachi/src/plugin/path_rewrite/join_katakana_oov/tests.rs index afaac93b..0d565b31 100644 --- a/sudachi/src/plugin/path_rewrite/join_katakana_oov/tests.rs +++ b/sudachi/src/plugin/path_rewrite/join_katakana_oov/tests.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,8 +105,10 @@ fn start_with_tail() { #[test] fn with_noovbow() { - let mut plugin = JoinKatakanaOovPlugin::default(); - plugin.min_length = 3; + let plugin = JoinKatakanaOovPlugin { + min_length: 3, + ..Default::default() + }; let text = build_text("ァアイアイウ"); diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/mod.rs b/sudachi/src/plugin/path_rewrite/join_numeric/mod.rs index 730b492b..d014781e 100644 --- a/sudachi/src/plugin/path_rewrite/join_numeric/mod.rs +++ b/sudachi/src/plugin/path_rewrite/join_numeric/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -102,10 +102,10 @@ impl JoinNumericPlugin { for c in s.chars() { if !parser.append(&c) { if begin_idx >= 0 { - if parser.error_state == numeric_parser::Error::COMMA { + if parser.error_state == numeric_parser::Error::Comma { comma_as_digit = false; i = begin_idx - 1; - } else if parser.error_state == numeric_parser::Error::POINT { + } else if parser.error_state == numeric_parser::Error::Point { period_as_digit = false; i = begin_idx - 1; } @@ -132,8 +132,8 @@ impl JoinNumericPlugin { i = begin_idx + 1; } else { let ss = path[i as usize - 1].word_info().normalized_form(); - if (parser.error_state == numeric_parser::Error::COMMA && ss == ",") - || (parser.error_state == numeric_parser::Error::POINT && ss == ".") + if (parser.error_state == numeric_parser::Error::Comma && ss == ",") + || (parser.error_state == numeric_parser::Error::Point && ss == ".") { path = self.concat(path, begin_idx as usize, i as usize - 1, &mut parser)?; @@ -157,8 +157,8 @@ impl JoinNumericPlugin { path = self.concat(path, begin_idx as usize, len, &mut parser)?; } else { let ss = path[len - 1].word_info().normalized_form(); - if (parser.error_state == numeric_parser::Error::COMMA && ss == ",") - || (parser.error_state == numeric_parser::Error::POINT && ss == ".") + if (parser.error_state == numeric_parser::Error::Comma && ss == ",") + || (parser.error_state == numeric_parser::Error::Point && ss == ".") { path = self.concat(path, begin_idx as usize, len - 1, &mut parser)?; } diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs index 30e61e1f..da2e87cc 100644 --- a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs +++ b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications 
Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,10 +24,10 @@ use string_number::StringNumber; /// State of the parser #[derive(Debug, Eq, PartialEq)] pub enum Error { - NONE, - POINT, - COMMA, - // OTHER, + None, + Point, + Comma, + // Other, } /// Parses number written by arabic or kanji @@ -83,7 +83,7 @@ impl NumericParser { is_first_digit: true, has_comma: false, has_hanging_point: false, - error_state: Error::NONE, + error_state: Error::None, total: StringNumber::new(), subtotal: StringNumber::new(), tmp: StringNumber::new(), @@ -95,7 +95,7 @@ impl NumericParser { self.is_first_digit = true; self.has_comma = false; self.has_hanging_point = false; - self.error_state = Error::NONE; + self.error_state = Error::None; self.total.clear(); self.subtotal.clear(); self.tmp.clear(); @@ -105,15 +105,15 @@ impl NumericParser { if *c == '.' 
{ self.has_hanging_point = true; if self.is_first_digit { - self.error_state = Error::POINT; + self.error_state = Error::Point; return false; } if self.has_comma && !self.check_comma() { - self.error_state = Error::COMMA; + self.error_state = Error::Comma; return false; } if !self.tmp.set_point() { - self.error_state = Error::POINT; + self.error_state = Error::Point; return false; } self.has_comma = false; @@ -121,7 +121,7 @@ impl NumericParser { } if *c == ',' { if !self.check_comma() { - self.error_state = Error::COMMA; + self.error_state = Error::Comma; return false; } self.has_comma = true; @@ -168,18 +168,18 @@ impl NumericParser { pub fn done(&mut self) -> bool { let ret = self.subtotal.add(&mut self.tmp) && self.total.add(&mut self.subtotal); if self.has_hanging_point { - self.error_state = Error::POINT; + self.error_state = Error::Point; return false; } if self.has_comma && self.digit_length != 3 { - self.error_state = Error::COMMA; + self.error_state = Error::Comma; return false; } ret } pub fn get_normalized(&mut self) -> String { - self.total.to_string() + self.total.get_normalized() } fn check_comma(&self) -> bool { @@ -193,7 +193,7 @@ impl NumericParser { } fn is_small_unit(n: i32) -> bool { - -3 <= n && n < 0 + (-3..0).contains(&n) } fn is_large_unit(n: i32) -> bool { n < -3 @@ -327,23 +327,23 @@ mod tests { parser.clear(); assert!(!parse(&mut parser, "200,00,000")); - assert_eq!(Error::COMMA, parser.error_state); + assert_eq!(Error::Comma, parser.error_state); parser.clear(); assert!(!parse(&mut parser, "2,4")); - assert_eq!(Error::COMMA, parser.error_state); + assert_eq!(Error::Comma, parser.error_state); parser.clear(); assert!(!parse(&mut parser, "000,000")); - assert_eq!(Error::COMMA, parser.error_state); + assert_eq!(Error::Comma, parser.error_state); parser.clear(); assert!(!parse(&mut parser, ",000")); - assert_eq!(Error::COMMA, parser.error_state); + assert_eq!(Error::Comma, parser.error_state); parser.clear(); assert!(!parse(&mut parser, 
"256,55.1")); - assert_eq!(Error::COMMA, parser.error_state); + assert_eq!(Error::Comma, parser.error_state); } #[test] @@ -360,10 +360,10 @@ mod tests { parser.clear(); assert!(!parse(&mut parser, "6.")); - assert_eq!(Error::POINT, parser.error_state); + assert_eq!(Error::Point, parser.error_state); parser.clear(); assert!(!parse(&mut parser, "1.2.3")); - assert_eq!(Error::POINT, parser.error_state); + assert_eq!(Error::Point, parser.error_state); } } diff --git a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs index 5c8d0ea4..7e03ffe7 100644 --- a/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs +++ b/sudachi/src/plugin/path_rewrite/join_numeric/numeric_parser/string_number.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,10 +97,10 @@ impl StringNumber { } pub fn is_zero(&self) -> bool { - self.significand.len() == 0 + self.significand.is_empty() } - pub fn to_string(&mut self) -> String { + pub fn get_normalized(&mut self) -> String { if self.is_zero() { return "0".to_owned(); } @@ -121,7 +121,7 @@ impl StringNumber { .count(); self.significand .truncate(self.significand.len() - n_last_zero); - if self.significand.chars().last().unwrap() == '.' { + if self.significand.ends_with('.') { self.significand.truncate(self.significand.len() - 1); } } diff --git a/sudachi/src/plugin/path_rewrite/mod.rs b/sudachi/src/plugin/path_rewrite/mod.rs index 64c86d4e..df0d2919 100644 --- a/sudachi/src/plugin/path_rewrite/mod.rs +++ b/sudachi/src/plugin/path_rewrite/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,8 +53,8 @@ impl PluginCategory for dyn PathRewritePlugin { fn bundled_impl(name: &str) -> Option { match name { - "JoinNumericPlugin" => Some(Box::new(JoinNumericPlugin::default())), - "JoinKatakanaOovPlugin" => Some(Box::new(JoinKatakanaOovPlugin::default())), + "JoinNumericPlugin" => Some(Box::::default()), + "JoinKatakanaOovPlugin" => Some(Box::::default()), _ => None, } } diff --git a/sudachi/src/sentence_detector.rs b/sudachi/src/sentence_detector.rs index 805b1888..29285c76 100644 --- a/sudachi/src/sentence_detector.rs +++ b/sudachi/src/sentence_detector.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -76,6 +76,12 @@ pub struct SentenceDetector { limit: usize, } +impl Default for SentenceDetector { + fn default() -> Self { + Self::new() + } +} + impl SentenceDetector { pub fn new() -> Self { SentenceDetector { @@ -175,13 +181,13 @@ fn parenthesis_level(s: &str) -> SudachiResult { )) .unwrap(); } - let mut level = 0; + let mut level: usize = 0; for caps in PARENTHESIS.captures_iter(s) { - if let Some(_) = caps?.get(1) { + if caps?.get(1).is_some() { // open level += 1; - } else if level > 0 { - level -= 1; + } else { + level = level.saturating_sub(1); } } Ok(level) @@ -225,7 +231,7 @@ fn is_continuous_phrase(s: &str, eos: usize) -> SudachiResult { } // we can safely unwrap since eos < s.len() - let c = s[eos..].chars().nth(0).unwrap(); + let c = s[eos..].chars().next().unwrap(); Ok((c == 'と' || c == 'や' || c == 'の') && EOS_ITEMIZE_HEADER.is_match(&s[..eos])?) 
} diff --git a/sudachi/src/sentence_splitter.rs b/sudachi/src/sentence_splitter.rs index 5fa77b7c..cd465951 100644 --- a/sudachi/src/sentence_splitter.rs +++ b/sudachi/src/sentence_splitter.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,6 +56,12 @@ pub struct SentenceSplitter<'a> { checker: Option>, } +impl Default for SentenceSplitter<'_> { + fn default() -> Self { + Self::new() + } +} + impl SentenceSplitter<'_> { pub fn new() -> Self { SentenceSplitter { diff --git a/sudachi/src/util/check_params.rs b/sudachi/src/util/check_params.rs index aeab14e5..80ca1888 100644 --- a/sudachi/src/util/check_params.rs +++ b/sudachi/src/util/check_params.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,7 +39,7 @@ impl<'a> CheckParams for Grammar<'a> { format!("max grammar leftId is {}", self.conn_matrix().num_left()), )); } - return Ok(x as u16); + Ok(x as u16) } fn check_right_id>(&self, raw: T) -> SudachiResult { @@ -57,7 +57,7 @@ impl<'a> CheckParams for Grammar<'a> { format!("max grammar rightId is {}", self.conn_matrix().num_right()), )); } - return Ok(x as u16); + Ok(x as u16) } fn check_cost>(&self, raw: T) -> SudachiResult { @@ -82,6 +82,6 @@ impl<'a> CheckParams for Grammar<'a> { ), )); } - return Ok(x as i16); + Ok(x as i16) } } diff --git a/sudachi/src/util/cow_array.rs b/sudachi/src/util/cow_array.rs index 73d8556a..c25ba025 100644 --- a/sudachi/src/util/cow_array.rs +++ b/sudachi/src/util/cow_array.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. 
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,7 +26,7 @@ pub trait ReadLE { impl ReadLE for i16 { fn from_le_bytes(bytes: &[u8]) -> Result { - bytes.try_into().map(|b| Self::from_le_bytes(b)) + bytes.try_into().map(Self::from_le_bytes) } } @@ -35,7 +35,7 @@ impl ReadLE for u32 { where Self: Sized, { - bytes.try_into().map(|b| Self::from_le_bytes(b)) + bytes.try_into().map(Self::from_le_bytes) } } @@ -111,7 +111,9 @@ impl<'a, T: ReadLE + Clone> CowArray<'a, T> { let slice: &[T] = self.storage.as_ref().unwrap().as_slice(); self.slice = unsafe { std::mem::transmute(slice) }; } - self.storage.as_mut().map(|s| s[offset] = value); + if let Some(s) = self.storage.as_mut() { + s[offset] = value + } } } diff --git a/sudachi/src/util/testing.rs b/sudachi/src/util/testing.rs index c1c4b860..520919ff 100644 --- a/sudachi/src/util/testing.rs +++ b/sudachi/src/util/testing.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,7 +32,7 @@ pub fn char_cats() -> CharacterCategory { pub fn build_mock_bytes() -> Vec { let mut buf = Vec::new(); // encode pos for oov - buf.extend(&(1 as i16).to_le_bytes()); + buf.extend(&1_i16.to_le_bytes()); let pos = vec!["補助記号", "一般", "*", "*", "*", "*"]; for s in pos { let utf16: Vec<_> = s.encode_utf16().collect(); @@ -42,8 +42,8 @@ pub fn build_mock_bytes() -> Vec { } } // set 10 for left and right id sizes - buf.extend(&(10 as i16).to_le_bytes()); - buf.extend(&(10 as i16).to_le_bytes()); + buf.extend(&10_i16.to_le_bytes()); + buf.extend(&10_i16.to_le_bytes()); for i in 0..10 { for j in 0..10 { let val = i * 100 + j; diff --git a/sudachi/src/util/user_pos.rs b/sudachi/src/util/user_pos.rs index 7290e07a..06a2f332 100644 --- a/sudachi/src/util/user_pos.rs +++ b/sudachi/src/util/user_pos.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,17 +20,12 @@ use itertools::Itertools; use serde::Deserialize; use std::fmt::Display; -#[derive(Eq, PartialEq, Deserialize, Clone, Copy, Debug)] +#[derive(Eq, PartialEq, Deserialize, Clone, Copy, Debug, Default)] #[serde(rename_all = "lowercase")] pub enum UserPosMode { - Allow, + #[default] Forbid, -} - -impl Default for UserPosMode { - fn default() -> Self { - UserPosMode::Forbid - } + Allow, } pub trait UserPosSupport { diff --git a/sudachi/tests/common/mod.rs b/sudachi/tests/common/mod.rs index 430dd462..969d618d 100644 --- a/sudachi/tests/common/mod.rs +++ b/sudachi/tests/common/mod.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -38,7 +38,7 @@ use sudachi::dic::subset::InfoSubset; pub fn dictionary_bytes_from_path>(dictionary_path: P) -> SudachiResult> { let dictionary_path = dictionary_path.as_ref(); - let dictionary_stat = fs::metadata(&dictionary_path)?; + let dictionary_stat = fs::metadata(dictionary_path)?; let mut dictionary_file = File::open(dictionary_path)?; let mut dictionary_bytes = Vec::with_capacity(dictionary_stat.len() as usize); dictionary_file.read_to_end(&mut dictionary_bytes)?; @@ -59,9 +59,7 @@ lazy_static! { .resolved_system_dict() .expect("system dict failure"); - let dictionary_bytes = dictionary_bytes_from_path(dictionary_path) - .expect("Failed to read dictionary from path"); - dictionary_bytes + dictionary_bytes_from_path(dictionary_path).expect("Failed to read dictionary from path") }; static ref USER_DICTIONARY_BYTES: Vec> = { let mut bytes = Vec::with_capacity(TEST_CONFIG.user_dicts.len()); @@ -98,7 +96,7 @@ impl TestTokenizer { pub fn new() -> TestTokenizer { let dict = JapaneseDictionary::from_cfg(&TEST_CONFIG).expect("failed to make dictionary"); let tok = StatelessTokenizer::new(Arc::new(dict)); - return TestTokenizer { tok }; + TestTokenizer { tok } } pub fn tokenize<'a>( @@ -111,7 +109,7 @@ impl TestTokenizer { } pub fn dict(&self) -> &JapaneseDictionary { - &self.tok.as_dict() + self.tok.as_dict() } } diff --git a/sudachi/tests/morpheme.rs b/sudachi/tests/morpheme.rs index c13624d4..2fdb1937 100644 --- a/sudachi/tests/morpheme.rs +++ b/sudachi/tests/morpheme.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,7 +51,7 @@ fn morpheme_attributes() { assert_eq!("京都", ms.get(0).normalized_form()); assert_eq!("キョウト", ms.get(0).reading_form()); - assert_eq!(false, ms.get(0).is_oov()); + assert!(!ms.get(0).is_oov()); assert_eq!(3, ms.get(0).word_id().word()); assert_eq!(0, ms.get(0).dictionary_id()); diff --git a/sudachi/tests/regex_oov.rs b/sudachi/tests/regex_oov.rs index 0d21f855..a1956153 100644 --- a/sudachi/tests/regex_oov.rs +++ b/sudachi/tests/regex_oov.rs @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Works Applications Co., Ltd. + * Copyright (c) 2021-2024 Works Applications Co., Ltd. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ use std::ops::Deref; mod common; -const REGEX_CONFIG: &'static [u8] = include_bytes!("resources/sudachi.regex.json"); +const REGEX_CONFIG: &[u8] = include_bytes!("resources/sudachi.regex.json"); #[test] fn no_other_words() {