Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions python/src/word_info.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,9 +33,9 @@ pub struct PyWordInfo {
synonym_group_ids: Vec<u32>,
}

fn copy_if_empty(v1: String, v2: &String) -> String {
fn copy_if_empty(v1: String, v2: &str) -> String {
if v1.is_empty() {
v2.clone()
v2.to_owned()
} else {
v1
}
Expand Down
22 changes: 12 additions & 10 deletions sudachi-cli/src/analysis.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,14 +51,14 @@ impl<'a> Analysis for SplitSentencesOnly<'a> {
}
}

pub struct AnalyzeNonSplitted<D: DictionaryAccess, O: SudachiOutput<D>> {
output: O,
pub struct AnalyzeNonSplitted<D: DictionaryAccess, O: SudachiOutput<D> + ?Sized> {
output: Box<O>,
analyzer: StatefulTokenizer<D>,
morphemes: MorphemeList<D>,
}

impl<D: DictionaryAccess + Clone, O: SudachiOutput<D>> AnalyzeNonSplitted<D, O> {
pub fn new(output: O, dict: D, mode: Mode, enable_debug: bool) -> Self {
impl<D: DictionaryAccess + Clone, O: SudachiOutput<D> + ?Sized> AnalyzeNonSplitted<D, O> {
pub fn new(output: Box<O>, dict: D, mode: Mode, enable_debug: bool) -> Self {
Self {
output,
morphemes: MorphemeList::empty(dict.clone()),
Expand All @@ -67,7 +67,7 @@ impl<D: DictionaryAccess + Clone, O: SudachiOutput<D>> AnalyzeNonSplitted<D, O>
}
}

impl<D: DictionaryAccess, O: SudachiOutput<D>> Analysis for AnalyzeNonSplitted<D, O> {
impl<D: DictionaryAccess, O: SudachiOutput<D> + ?Sized> Analysis for AnalyzeNonSplitted<D, O> {
fn analyze(&mut self, input: &str, writer: &mut Writer) {
self.analyzer.reset().push_str(input);
self.analyzer
Expand All @@ -86,21 +86,23 @@ impl<D: DictionaryAccess, O: SudachiOutput<D>> Analysis for AnalyzeNonSplitted<D
}
}

pub struct AnalyzeSplitted<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> {
pub struct AnalyzeSplitted<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> {
splitter: SentenceSplitter<'a>,
inner: AnalyzeNonSplitted<&'a D, O>,
}

impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> AnalyzeSplitted<'a, D, O> {
pub fn new(output: O, dict: &'a D, mode: Mode, enable_debug: bool) -> Self {
impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> AnalyzeSplitted<'a, D, O> {
pub fn new(output: Box<O>, dict: &'a D, mode: Mode, enable_debug: bool) -> Self {
Self {
inner: AnalyzeNonSplitted::new(output, dict, mode, enable_debug),
splitter: SentenceSplitter::new().with_checker(dict.lexicon()),
}
}
}

impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D>> Analysis for AnalyzeSplitted<'a, D, O> {
impl<'a, D: DictionaryAccess + 'a, O: SudachiOutput<&'a D> + ?Sized> Analysis
for AnalyzeSplitted<'a, D, O>
{
fn analyze(&mut self, input: &str, writer: &mut Writer) {
for (_, sent) in self.splitter.split(input) {
self.inner.analyze(sent, writer);
Expand Down
54 changes: 31 additions & 23 deletions sudachi-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,12 @@ use clap::Parser;

use crate::analysis::{Analysis, AnalyzeNonSplitted, AnalyzeSplitted, SplitSentencesOnly};
use crate::build::{build_main, is_build_mode, BuildCli};
use sudachi::analysis::stateless_tokenizer::DictionaryAccess;
use sudachi::config::Config;
use sudachi::dic::dictionary::JapaneseDictionary;
use sudachi::prelude::*;

#[cfg(feature = "bake_dictionary")]
const BAKED_DICTIONARY_BYTES: &[u8] = include_bytes!(env!("SUDACHI_DICT_PATH"));

#[derive(Clone, Debug, Eq, PartialEq, Default)]
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum SentenceSplitMode {
/// Do both sentence splitting and analysis
#[default]
Expand Down Expand Up @@ -117,16 +115,33 @@ struct Cli {
command: Option<BuildCli>,
}

// We want to instantiate a different concrete type for each output format.
// The macro takes `$f`, a function that is called with the selected output value, so each branch is built with its own concrete type.
macro_rules! with_output {
($cli: expr, $f: expr) => {
if $cli.wakati {
Box::new($f(output::Wakachi::default()))
} else {
Box::new($f(output::Simple::new($cli.print_all)))
}
};
/// Builds the boxed output formatter selected by the given flags.
///
/// Returns a boxed `output::Wakachi` (default-constructed) when `wakachi` is
/// `true`; otherwise returns a boxed `output::Simple` created with `print_all`.
pub fn setup_output<D>(wakachi: bool, print_all: bool) -> Box<dyn output::SudachiOutput<D>>
where
    D: DictionaryAccess,
{
    // Handle the non-wakachi case first so the remaining path is unconditional.
    if !wakachi {
        return Box::new(output::Simple::new(print_all));
    }
    Box::new(output::Wakachi::default())
}

/// Creates the boxed `Analysis` implementation chosen by `args.split_sentences`.
///
/// * `SentenceSplitMode::Only` builds a `SplitSentencesOnly` over `dict`.
/// * `SentenceSplitMode::Default` builds an `AnalyzeSplitted`.
/// * `SentenceSplitMode::None` builds an `AnalyzeNonSplitted`.
///
/// The two analyzing variants receive an output created by `setup_output` from
/// `args.wakati` and `args.print_all`, plus `args.mode` and `args.enable_debug`.
fn setup_analyzer<'a>(args: &Cli, dict: &'a impl DictionaryAccess) -> Box<dyn Analysis + 'a> {
    // Both analyzing variants share these settings; copy them out once.
    let mode = args.mode;
    let debug = args.enable_debug;

    match args.split_sentences {
        SentenceSplitMode::None => Box::new(AnalyzeNonSplitted::new(
            setup_output(args.wakati, args.print_all),
            dict,
            mode,
            debug,
        )),
        SentenceSplitMode::Default => Box::new(AnalyzeSplitted::new(
            setup_output(args.wakati, args.print_all),
            dict,
            mode,
            debug,
        )),
        SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(dict)),
    }
}

fn main() {
Expand Down Expand Up @@ -169,15 +184,7 @@ fn main() {
let dict = JapaneseDictionary::from_cfg(&config)
.unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e));

let mut analyzer: Box<dyn Analysis> = match args.split_sentences {
SentenceSplitMode::Only => Box::new(SplitSentencesOnly::new(&dict)),
SentenceSplitMode::Default => with_output!(args, |o| {
AnalyzeSplitted::new(o, &dict, args.mode, args.enable_debug)
}),
SentenceSplitMode::None => with_output!(args, |o| {
AnalyzeNonSplitted::new(o, &dict, args.mode, args.enable_debug)
}),
};
let mut analyzer: Box<dyn Analysis> = setup_analyzer(&args, &dict);

let mut data = String::with_capacity(4 * 1024);
let is_stdout = args.output_file.is_none();
Expand Down Expand Up @@ -213,6 +220,7 @@ fn strip_eol(data: &str) -> &str {
// Safety: str was correct and we only removed full characters
unsafe { std::str::from_utf8_unchecked(bytes) }
}

#[cfg(test)]
mod tests {
use clap::CommandFactory;
Expand Down
2 changes: 1 addition & 1 deletion sudachi/src/analysis/stateless_tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ pub(super) fn split_path<T: DictionaryAccess + ?Sized>(
Ok(new_path)
}

pub(super) fn dump_path(path: &Vec<ResultNode>) {
pub(super) fn dump_path(path: &[ResultNode]) {
for (i, node) in path.iter().enumerate() {
println!("{}: {}", i, node);
}
Expand Down
2 changes: 1 addition & 1 deletion sudachi/src/dic/build/lexicon.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ impl StrPosEntry {
Self { data: owned }
}

pub fn from_built_pos(data: &Vec<String>) -> Self {
pub fn from_built_pos(data: &[String]) -> Self {
let mut iter = data.iter().map(|x| x.as_str());
let p1 = Cow::Borrowed(iter.next().unwrap());
let p2 = Cow::Borrowed(iter.next().unwrap());
Expand Down
32 changes: 1 addition & 31 deletions sudachi/src/dic/lexicon/word_infos.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,9 +14,6 @@
* limitations under the License.
*/

use std::iter::FusedIterator;

use crate::dic::lexicon_set::LexiconSet;
use crate::dic::read::u32_parser;
use crate::dic::read::word_info::WordInfoParser;
use crate::dic::subset::InfoSubset;
Expand Down Expand Up @@ -177,30 +174,3 @@ impl From<WordInfo> for WordInfoData {
info.data
}
}

/// Iterator that looks up the word info for each `WordId` in a split list.
struct SplitIter<'a> {
    /// Position in `split` of the next id to yield.
    index: usize,
    /// Word ids to resolve, in iteration order.
    split: &'a [WordId],
    /// Lexicon queried through `get_word_info` for every id.
    lexicon: &'a LexiconSet<'a>,
}

impl Iterator for SplitIter<'_> {
    type Item = SudachiResult<WordInfo>;

    /// Yields the lookup result for the next id, or `None` once `split` is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        // `get` returns `None` past the end, which ends iteration without advancing.
        let id = *self.split.get(self.index)?;
        self.index += 1;
        Some(self.lexicon.get_word_info(id))
    }

    /// Reports the exact number of ids not yet yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `index` only advances while in bounds, so this subtraction cannot underflow.
        let remaining = self.split.len() - self.index;
        (remaining, Some(remaining))
    }
}

// Once `next` returns `None` the index stops advancing, so it keeps returning `None`.
impl FusedIterator for SplitIter<'_> {}
2 changes: 1 addition & 1 deletion sudachi/src/dic/lexicon_set.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ impl LexiconSet<'_> {
self.lexicons[dic_id].get_word_param(id.word())
}

fn update_dict_id(split: &mut Vec<WordId>, dict_id: u8) -> SudachiResult<()> {
fn update_dict_id(split: &mut [WordId], dict_id: u8) -> SudachiResult<()> {
for id in split.iter_mut() {
let cur_dict_id = id.dic();
if cur_dict_id > 0 {
Expand Down
13 changes: 9 additions & 4 deletions sudachi/src/error.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,7 +15,6 @@
*/

use std::fmt::Debug;
use std::io::Error;
use thiserror::Error;

use crate::config::ConfigError;
Expand Down Expand Up @@ -50,7 +49,7 @@ pub enum SudachiError {
FromUtf16(#[from] std::string::FromUtf16Error),

#[error("Regex error")]
RegexError(#[from] fancy_regex::Error),
RegexError { cause: Box<fancy_regex::Error> },

#[error("Error from nom {0}")]
NomParseError(String),
Expand Down Expand Up @@ -108,14 +107,20 @@ pub enum SudachiError {
}

impl From<std::io::Error> for SudachiError {
fn from(e: Error) -> Self {
fn from(e: std::io::Error) -> Self {
SudachiError::Io {
cause: e,
context: String::from("IO Error"),
}
}
}

impl From<fancy_regex::Error> for SudachiError {
fn from(e: fancy_regex::Error) -> Self {
SudachiError::RegexError { cause: Box::new(e) }
}
}

impl SudachiError {
pub fn with_context<S: Into<String>>(self, ctx: S) -> Self {
match self {
Expand Down
6 changes: 3 additions & 3 deletions sudachi/src/input_text/buffer/edit.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -89,7 +89,7 @@ impl<'a> InputEditor<'a> {
// Current plugin implementations satisfy this criteria.
pub fn resolve_edits(
source: &str,
source_mapping: &Vec<usize>,
source_mapping: &[usize],
target: &mut String,
target_mapping: &mut Vec<usize>,
edits: &mut Vec<ReplaceOp>,
Expand Down Expand Up @@ -127,7 +127,7 @@ pub fn resolve_edits(
}

fn add_replace(
source_mapping: &Vec<usize>,
source_mapping: &[usize],
target: &mut String,
target_mapping: &mut Vec<usize>,
what: Range<usize>,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ impl NumericParser {
}

pub fn get_normalized(&mut self) -> String {
self.total.to_string()
self.total.get_normalized()
}

fn check_comma(&self) -> bool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ impl StringNumber {
self.significand.is_empty()
}

pub fn to_string(&mut self) -> String {
pub fn get_normalized(&mut self) -> String {
if self.is_zero() {
return "0".to_owned();
}
Expand Down