Skip to content
Closed

CLI #48

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
311 changes: 296 additions & 15 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 1 addition & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
[workspace]
resolver = "2"
members = [
"yake_rust"
]
members = ["yake", "yake_rust"]

[workspace.dependencies]
rusty-hook = "0.11.2"
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,3 +92,33 @@ Results:
| learning | learning | 0.1621 |
| goldbloom | Goldbloom | 0.1625 |
| machine | machine | 0.1672 |


### CLI
`yake` is the command-line interface (CLI) for the `yake_rust` library.

#### Basic usage
```shell
$ cargo install --path yake
$ yake --input-file yake_rust/src/test_google.txt
```
#### More options

```shell
$ yake --help

Usage: yake [OPTIONS] <--text-input <TEXT>|--input-file <FILE>>

Options:
--text-input <TEXT> Input text, SURROUNDED by single quotes(')
-i, --input-file <FILE> Input file
-n, --ngram-size <INTEGER> Max size of the ngram [default: 3]
--dedup-lim <FLOAT> Deduplication limiter [default: 0.9]
--window-size <INTEGER> Window size [default: 1]
-t, --top <INTEGER> Number of keyphrases to extract
-v, --verbose Gets detailed information (such as the score)
-l, --language <TEXT> Language [default: en]
--json Dump output as JSON
-h, --help Print help
-V, --version Print version
```
11 changes: 11 additions & 0 deletions yake/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "yake"
version = "0.1.0"
edition = "2021"

[dependencies]
yake-rust = { path = "../yake_rust" , features = ["serde"]}
clap = { version = "4.5.26", features = ["cargo", "derive", "string"] }
serde_json = "1.0.135"
exit-code = "1.0.0"
prettytable-rs = "0.10.0"
134 changes: 134 additions & 0 deletions yake/src/cli.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
use std::{path::PathBuf, sync::LazyLock};

use clap::error::ErrorKind;
use clap::{command, Args};
use clap::{CommandFactory, Parser};
use yake_rust::{Config, StopWords};

// Library default configuration, built lazily on first access.
// Its `ngrams`, `deduplication_threshold` and `window_size` fields supply the
// `default_value_t` of the corresponding options declared on `Cli`.
static DEFAULT_CONFIG: LazyLock<Config> = LazyLock::new(Config::default);

// Source of the text to analyse: either inline text or a file path.
// The group attribute makes the two options mutually exclusive and requires
// that one of them is given (`parse_cli` relies on this).
// NOTE: plain `//` comments are used on purpose; clap turns `///` comments
// on derive items into help text.
#[derive(Args)]
#[group(required = true, multiple = false)]
struct Input {
    // -ti, --text_input TEXT
    // (presumably the option spelling of the reference YAKE CLI — confirm;
    // only the long form is exposed here, since no `short` is requested)
    /// Input text
    #[arg(
        conflicts_with = "input_file",
        long,
        help = "Input text, SURROUNDED by single quotes(')",
        value_name = "TEXT"
    )]
    text_input: Option<String>,

    // -i, --input_file TEXT
    // Path of a file whose contents are read by `parse_cli` and used as the input text.
    /// Input file
    #[arg(conflicts_with = "text_input", short, long, help = "Input file", value_name = "FILE")]
    input_file: Option<PathBuf>,
}

// TODO: add a `-df, --dedup-func [leve|jaro|seqm]` option
// to select the deduplication function.

// Command-line arguments of the `yake` binary, declared with clap's derive API.
// `parse_cli` converts a parsed `Cli` into the public `ParsedCli`.
// NOTE: plain `//` comments are used on purpose; clap turns `///` comments
// on derive items into help text.
#[derive(Parser)]
#[command(version, about, long_about = None)]
struct Cli {
    // Text source (inline text or file); flattened so that its options
    // appear next to the other top-level options.
    #[command(flatten)]
    input: Input,

    // -n, --ngram-size INTEGER
    // Defaults to `DEFAULT_CONFIG.ngrams`; copied into `Config::ngrams`.
    /// Max size of the ngram
    #[arg(short, long, default_value_t = DEFAULT_CONFIG.ngrams, help = "Max size of the ngram", value_name = "INTEGER")]
    ngram_size: usize,

    // -dl, --dedup-lim FLOAT
    // Validated by `parse_dedup`; defaults to `DEFAULT_CONFIG.deduplication_threshold`.
    /// Deduplication limiter
    #[arg(long, value_parser = parse_dedup, default_value_t = DEFAULT_CONFIG.deduplication_threshold, help = "Deduplication limiter", value_name = "FLOAT")]
    dedup_lim: f64,

    // -ws, --window-size INTEGER
    // Defaults to `DEFAULT_CONFIG.window_size`; copied into `Config::window_size`.
    /// Window size
    #[arg(long, default_value_t = DEFAULT_CONFIG.window_size, help = "Window size", value_name = "INTEGER")]
    window_size: usize,

    // -t, --top INTEGER
    // Optional: when omitted, `None` is handed to the extractor unchanged.
    /// Number of keyphrases to extract
    #[arg(short, long, help = "Number of keyphrases to extract", value_name = "INTEGER")]
    top: Option<usize>,

    // -v, --verbose
    /// Gets detailed information (such as the score)
    #[arg(short, long, help = "Gets detailed information (such as the score)")]
    verbose: bool,

    // --help
    // No field is declared for it: clap adds `-h, --help` on its own.

    // -l, --language TEXT
    // Converted into a stop-word list by `parse_language`; defaults to "en".
    /// Language
    #[arg(short, long, default_value= "en", value_parser = parse_language, help = "Language", value_name = "TEXT")]
    language: StopWords,

    // --json
    // Switches the output from a table to JSON.
    #[arg(long, help = "Dump output as JSON")]
    json: bool,
}

/// Resolves the `--language` value into the matching predefined stop-word list.
///
/// Returns an error message for clap to report when `StopWords::predefined`
/// has no entry for `cli_language`.
fn parse_language(cli_language: &str) -> Result<StopWords, String> {
    match StopWords::predefined(cli_language) {
        Some(stop_words) => Ok(stop_words),
        None => Err(format!("Could not find language {}, did you enable this feature?", cli_language)),
    }
}

/// Parses and validates the `--dedup-lim` value.
///
/// Accepts any float inside the closed interval `0..=1`.
///
/// # Errors
///
/// * the text is not a valid float: the standard library's own parse error
///   message is returned, so the user sees the real cause;
/// * the number lies outside `0..=1` (this includes `NaN`).
fn parse_dedup(cli_dedup_lim: &str) -> Result<f64, String> {
    let value = cli_dedup_lim.parse::<f64>().map_err(|err| err.to_string())?;

    if (0.0..=1.0).contains(&value) {
        Ok(value)
    } else {
        // Also reached for NaN, which compares false against both bounds.
        Err(format!("{value} is not in the range 0..=1"))
    }
}

/// Everything the binary needs after argument parsing, as produced by `parse_cli`.
pub struct ParsedCli {
    /// Extraction settings: n-gram size, window size and deduplication threshold
    /// come from the command line, every other field keeps its default.
    pub config: Config,
    /// Stop-word list selected with `--language`.
    pub language: StopWords,
    /// Text to analyse: the `--text-input` value or the contents of `--input-file`.
    pub input: String,
    /// Whether `--json` was given.
    pub json: bool,
    /// Value of `--top`, `None` when the option was omitted.
    pub top: Option<usize>,
    /// Whether `--verbose` was given.
    pub verbose: bool,
}

pub fn parse_cli() -> ParsedCli {
let cli = Cli::parse();

let input = match (cli.input.text_input, cli.input.input_file) {
(None, None) | (Some(_), Some(_)) => {
panic!("clap should ensure that either text-input or input-file is specified")
}
(None, Some(path_to_file)) => match std::fs::read_to_string(&path_to_file) {
Ok(text) => text,
Err(err) => {
Cli::command()
.error(
ErrorKind::ValueValidation,
format!("Error reading file `{}`: {:?}", path_to_file.display(), err),
)
.exit();
}
},
(Some(text), None) => text,
};

ParsedCli {
config: Config {
ngrams: cli.ngram_size,
window_size: cli.window_size,
deduplication_threshold: cli.dedup_lim,
..Config::default()
},
language: cli.language,
input,
json: cli.json,
verbose: cli.verbose,
top: cli.top,
}
}
64 changes: 64 additions & 0 deletions yake/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
use cli::{parse_cli, ParsedCli};
use prettytable::{format, row, Table};
use yake_rust::{ResultItem, Yake};

mod cli;

fn main() {
let ParsedCli { language, json, input, config, top, verbose } = parse_cli();

let now = std::time::Instant::now();

let keywords = Yake::new(language, config).get_n_best(&input, top);

output_keywords(&keywords, json, verbose);
if verbose {
eprintln!("Elapsed: {:.2?}", now.elapsed());
}
}

/// Prints `keywords` in the requested format.
///
/// `json` takes precedence over `verbose`; with neither flag, only the
/// keywords themselves are listed.
fn output_keywords(keywords: &[ResultItem], json: bool, verbose: bool) {
    if json {
        output_keywords_json(keywords);
    } else if verbose {
        output_keywords_verbose(keywords);
    } else {
        output_keywords_simple(keywords);
    }
}

/// Prints a three-column table (keyword, raw form, score) to standard output.
///
/// Scores are shown with four decimal places.
fn output_keywords_verbose(keywords: &[ResultItem]) {
    let mut report = Table::new();
    report.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR);
    report.set_titles(row!["keyword", "raw", "score"]);

    for item in keywords {
        let score = format!("{:.4}", item.score);
        report.add_row(row![item.keyword, item.raw, score]);
    }

    report.printstd();
}

/// Prints a single-column table listing only the keywords to standard output.
fn output_keywords_simple(keywords: &[ResultItem]) {
    let mut report = Table::new();
    report.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR);
    report.set_titles(row!["keyword"]);

    for item in keywords {
        report.add_row(row![item.keyword]);
    }

    report.printstd();
}

/// Prints `keywords` as a single line of JSON on standard output.
///
/// If serialization fails, the error is reported on stderr and the process
/// exits with `exit_code::SOFTWARE_ERROR`.
fn output_keywords_json(keywords: &[ResultItem]) {
    let serialized = serde_json::to_string(keywords).unwrap_or_else(|e| {
        eprintln!("Unexpected error happened while trying to serialize result to json : {:?}", e);
        std::process::exit(exit_code::SOFTWARE_ERROR)
    });

    println!("{}", serialized);
}
1 change: 1 addition & 0 deletions yake_rust/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ contractions = "0.5.4"
segtok = "0.1.2"
levenshtein = "1.0.5"
indexmap = "2.7.0"
serde = { version = "1.0.217", optional = true }

[dev-dependencies]
divan = "0.1.17"
Expand Down
44 changes: 44 additions & 0 deletions yake_rust/src/context.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
use std::collections::HashMap;

use crate::counter::Counter;
use crate::UTerm;

/// Stats for a single term `T` against other terms.
#[derive(Default)]
pub struct PairwiseFreq<'s> {
    /// How often `T` stands after another term `A` (`A..T`), counted per `A`.
    follows: Counter<&'s UTerm>,
    /// How often `T` stands before another term `A` (`T..A`), counted per `A`.
    followed_by: Counter<&'s UTerm>,
}

/// Neighbourhood statistics of all tracked terms, filled through `track`.
#[derive(Default)]
pub struct Contexts<'s> {
    /// For each term, the counters of the terms seen on its left and on its right.
    map: HashMap<&'s UTerm, PairwiseFreq<'s>>,
}

impl<'s> Contexts<'s> {
pub fn track(&mut self, left: &'s UTerm, right: &'s UTerm) {
self.map.entry(right).or_default().follows.inc(left);
self.map.entry(left).or_default().followed_by.inc(right);
}

/// The total number of cases where `term` stands on the left side of `by`: `term .. by`
pub fn cases_term_is_followed(&self, term: &'s UTerm, by: &'s UTerm) -> usize {
self.map.get(&term).unwrap().followed_by.get(&by)
}

/// Value showing how divergent the surrounding of a term is.
/// The term may appear many times with the same words around, which means it's a fixed expression.
///
/// `0` is fixed, `1` is divergent.
pub fn diversity_of(&self, term: &'s UTerm) -> (f64, f64) {
match self.map.get(&term) {
None => (0., 0.),
Some(PairwiseFreq { follows: leftward, followed_by: rightward }) => (
if leftward.is_empty() { 0. } else { leftward.distinct() as f64 / leftward.total() as f64 },
if rightward.is_empty() { 0. } else { rightward.distinct() as f64 / rightward.total() as f64 },
),
}
}
}
Loading