diff --git a/Cargo.lock b/Cargo.lock index 74f6d47..a6480bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -28,12 +28,56 @@ dependencies = [ "memchr", ] +[[package]] +name = "anstream" +version = "0.6.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + [[package]] name = "anstyle" version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" +dependencies = [ + "anstyle", + "once_cell", + "windows-sys 0.59.0", +] + [[package]] name = "arbitrary" version = "1.4.1" @@ -150,30 +194,51 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.26" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783" +checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" dependencies = [ "clap_builder", + "clap_derive", ] [[package]] name = "clap_builder" -version = "4.5.26" +version = "4.5.27" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" dependencies = [ + "anstream", "anstyle", "clap_lex", + "strsim", "terminal_size", ] +[[package]] +name = "clap_derive" +version = "4.5.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "clap_lex" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + [[package]] name = "condtype" version = "1.3.0" @@ -249,6 +314,27 @@ dependencies = [ "typenum", ] +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + [[package]] name = "deflate64" version = "0.1.9" @@ -292,6 +378,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -334,6 +441,12 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + [[package]] name = "envmnt" version = "0.8.4" @@ -357,9 +470,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.59.0", ] +[[package]] +name = "exit-code" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf4cdb977193f2d7688525ca10d86199fe9bdd9db26b97ef490558128c643dc4" + [[package]] name = "fancy-regex" version = "0.14.0" @@ -429,6 +548,18 @@ version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "hmac" version = "0.12.1" @@ -462,9 +593,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -479,6 +610,23 @@ dependencies = [ "generic-array", ] +[[package]] +name = "is-terminal" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.14.0" @@ -503,6 +651,12 @@ dependencies = [ "libc", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "levenshtein" version = "1.0.5" @@ -515,6 +669,16 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +[[package]] +name = "libredox" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +dependencies = [ + "bitflags", + "libc", +] + [[package]] name = "linked-hash-map" version = "0.5.6" @@ -647,6 +811,20 @@ dependencies = [ "yansi", ] +[[package]] +name = "prettytable-rs" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eea25e07510aa6ab6547308ebe3c036016d162b8da920dbb079e3ba8acf3d95a" +dependencies = [ + "csv", + "encode_unicode", + "is-terminal", + "lazy_static", + "term", + "unicode-width", +] + [[package]] name = "proc-macro2" version = "1.0.93" @@ -695,6 
+873,17 @@ dependencies = [ "getrandom", ] +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom", + "libredox", + "thiserror 1.0.69", +] + [[package]] name = "regex" version = "1.11.1" @@ -740,9 +929,15 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] +[[package]] +name = "rustversion" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" + [[package]] name = "rusty-hook" version = "0.11.2" @@ -794,9 +989,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.135" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" +checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" dependencies = [ "itoa", "memchr", @@ -846,6 +1041,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "subtle" version = "2.6.1" @@ -863,6 +1064,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "term" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f" +dependencies = [ + "dirs-next", + "rustversion", + "winapi", +] + [[package]] name = "terminal_size" version = "0.4.1" @@ -870,7 +1082,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9" dependencies = [ "rustix", - "windows-sys", + "windows-sys 0.59.0", 
+] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", ] [[package]] @@ -879,7 +1100,18 @@ version = "2.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.11", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", ] [[package]] @@ -939,6 +1171,12 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + [[package]] name = "version_check" version = "0.9.5" @@ -951,6 +1189,37 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + [[package]] name = "windows-sys" version = "0.59.0" @@ -1024,18 +1293,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "yake" +version = "0.1.0" +dependencies = [ + "clap", + "exit-code", + "prettytable-rs", + "serde_json", + "yake-rust", +] + [[package]] name = "yake-rust" version = "0.2.0" dependencies = [ "contractions", "divan", - "indexmap 2.7.0", + "indexmap 2.7.1", "levenshtein", "pretty_assertions", "regex", "rusty-hook", "segtok", + "serde", "streaming-stats", "zip", ] @@ -1103,13 +1384,13 @@ dependencies = [ "displaydoc", "flate2", "hmac", - "indexmap 2.7.0", + "indexmap 2.7.1", "lzma-rs", "memchr", "pbkdf2", "rand", "sha1", - "thiserror", + "thiserror 2.0.11", "time", "zeroize", "zopfli", diff --git a/Cargo.toml b/Cargo.toml index cd4b45c..ff5bfe9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,6 @@ [workspace] resolver = "2" -members = [ - "yake_rust" -] +members = ["yake", "yake_rust"] [workspace.dependencies] rusty-hook = "0.11.2" diff --git a/README.md b/README.md index 62846fa..23bb9b8 100644 --- a/README.md +++ b/README.md @@ -92,3 +92,33 @@ Results: | learning | learning | 0.1621 | | goldbloom | Goldbloom | 0.1625 | | machine | machine | 0.1672 | + + +### CLI +`yake` is the CLI implementation of `yake_rust` + +#### Basic usage +```shell +$ cargo install --path yake +$ yake --input-file yake_rust/src/test_google.txt +``` +#### More options + +```shell +$ yake --help + +Usage: yake [OPTIONS] <--text-input |--input-file > + 
+Options: + --text-input Input text, SURROUNDED by single quotes(') + -i, --input-file Input file + -n, --ngram-size Max size of the ngram [default: 3] + --dedup-lim Deduplication limiter [default: 0.9] + --window-size Window size [default: 1] + -t, --top Number of keyphrases to extract + -v, --verbose Gets detailed information (such as the score) + -l, --language Language [default: en] + --json Dump output as JSON + -h, --help Print help + -V, --version Print version +``` \ No newline at end of file diff --git a/yake/Cargo.toml b/yake/Cargo.toml new file mode 100644 index 0000000..77d72d0 --- /dev/null +++ b/yake/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "yake" +version = "0.1.0" +edition = "2021" + +[dependencies] +yake-rust = { path = "../yake_rust" , features = ["serde"]} +clap = { version = "4.5.26", features = ["cargo", "derive", "string"] } +serde_json = "1.0.135" +exit-code = "1.0.0" +prettytable-rs = "0.10.0" diff --git a/yake/src/cli.rs b/yake/src/cli.rs new file mode 100644 index 0000000..6e91d77 --- /dev/null +++ b/yake/src/cli.rs @@ -0,0 +1,134 @@ +use std::{path::PathBuf, sync::LazyLock}; + +use clap::error::ErrorKind; +use clap::{command, Args}; +use clap::{CommandFactory, Parser}; +use yake_rust::{Config, StopWords}; + +static DEFAULT_CONFIG: LazyLock = LazyLock::new(Config::default); + +#[derive(Args)] +#[group(required = true, multiple = false)] +struct Input { + // -ti, --text_input TEXT + /// Input text + #[arg( + conflicts_with = "input_file", + long, + help = "Input text, SURROUNDED by single quotes(')", + value_name = "TEXT" + )] + text_input: Option, + + // -i, --input_file TEXT + /// Input file + #[arg(conflicts_with = "text_input", short, long, help = "Input file", value_name = "FILE")] + input_file: Option, +} + +// TODO +// -df, --dedup-func [leve|jaro|seqm] +// Deduplication function. 
+ +#[derive(Parser)] +#[command(version, about, long_about = None)] +struct Cli { + #[command(flatten)] + input: Input, + + // -n, --ngram-size INTEGER + /// Max size of the ngram + #[arg(short, long, default_value_t = DEFAULT_CONFIG.ngrams, help = "Max size of the ngram", value_name = "INTEGER")] + ngram_size: usize, + + // -dl, --dedup-lim FLOAT + /// Deduplication limiter + #[arg(long, value_parser = parse_dedup, default_value_t = DEFAULT_CONFIG.deduplication_threshold, help = "Deduplication limiter", value_name = "FLOAT")] + dedup_lim: f64, + + // -ws, --window-size INTEGER + /// Window size + #[arg(long, default_value_t = DEFAULT_CONFIG.window_size, help = "Window size", value_name = "INTEGER")] + window_size: usize, + + // -t, --top INTEGER + /// Number of keyphrases to extract + #[arg(short, long, help = "Number of keyphrases to extract", value_name = "INTEGER")] + top: Option, + + // -v, --verbose + /// Gets detailed information (such as the score) + #[arg(short, long, help = "Gets detailed information (such as the score)")] + verbose: bool, + + // // --help + // /// Show this message and exit + // #[arg(short, long)] + // help: bool, + + // -l, --language TEXT + /// Language + #[arg(short, long, default_value= "en", value_parser = parse_language, help = "Language", value_name = "TEXT")] + language: StopWords, + + #[arg(long, help = "Dump output as JSON")] + json: bool, +} + +fn parse_language(cli_language: &str) -> Result { + StopWords::predefined(cli_language) + .ok_or_else(|| format!("Could not find language {}, did you enable this feature?", cli_language)) +} + +fn parse_dedup(cli_dedup_lim: &str) -> Result { + match cli_dedup_lim.parse::() { + Ok(value @ 0f64..=1f64) => Ok(value), + Ok(value) => Err(format!("{} is not in the 0..=1", value)), + Err(_) => Err("invalid digit found in string".into()), + } +} + +pub struct ParsedCli { + pub config: Config, + pub language: StopWords, + pub input: String, + pub json: bool, + pub top: Option, + pub verbose: 
bool, +} + +pub fn parse_cli() -> ParsedCli { + let cli = Cli::parse(); + + let input = match (cli.input.text_input, cli.input.input_file) { + (None, None) | (Some(_), Some(_)) => { + panic!("clap should ensure that either text-input or input-file is specified") + } + (None, Some(path_to_file)) => match std::fs::read_to_string(&path_to_file) { + Ok(text) => text, + Err(err) => { + Cli::command() + .error( + ErrorKind::ValueValidation, + format!("Error reading file `{}`: {:?}", path_to_file.display(), err), + ) + .exit(); + } + }, + (Some(text), None) => text, + }; + + ParsedCli { + config: Config { + ngrams: cli.ngram_size, + window_size: cli.window_size, + deduplication_threshold: cli.dedup_lim, + ..Config::default() + }, + language: cli.language, + input, + json: cli.json, + verbose: cli.verbose, + top: cli.top, + } +} diff --git a/yake/src/main.rs b/yake/src/main.rs new file mode 100644 index 0000000..0f1b75f --- /dev/null +++ b/yake/src/main.rs @@ -0,0 +1,64 @@ +use cli::{parse_cli, ParsedCli}; +use prettytable::{format, row, Table}; +use yake_rust::{ResultItem, Yake}; + +mod cli; + +fn main() { + let ParsedCli { language, json, input, config, top, verbose } = parse_cli(); + + let now = std::time::Instant::now(); + + let keywords = Yake::new(language, config).get_n_best(&input, top); + + output_keywords(&keywords, json, verbose); + if verbose { + eprintln!("Elapsed: {:.2?}", now.elapsed()); + } +} + +fn output_keywords(keywords: &[ResultItem], json: bool, verbose: bool) { + match (json, verbose) { + (true, _) => { + output_keywords_json(&keywords); + } + (false, true) => { + output_keywords_verbose(&keywords); + } + (false, false) => { + output_keywords_simple(&keywords); + } + } +} + +fn output_keywords_verbose(keywords: &[ResultItem]) { + let mut table = Table::new(); + table.set_titles(row!["keyword", "raw", "score"]); + for keyword in keywords { + table.add_row(row![keyword.keyword, keyword.raw, format!("{:.4}", keyword.score)]); + } + 
table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR); + table.printstd() +} + +fn output_keywords_simple(keywords: &[ResultItem]) { + let mut table = Table::new(); + table.set_titles(row!["keyword"]); + for keyword in keywords { + table.add_row(row![keyword.keyword]); + } + table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR); + table.printstd() +} + +fn output_keywords_json(keywords: &[ResultItem]) { + match serde_json::to_string(&keywords) { + Ok(str) => { + println!("{}", str) + } + Err(e) => { + eprintln!("Unexpected error happened while trying to serialize result to json : {:?}", e); + std::process::exit(exit_code::SOFTWARE_ERROR) + } + } +} diff --git a/yake_rust/Cargo.toml b/yake_rust/Cargo.toml index 75e7711..789b86a 100644 --- a/yake_rust/Cargo.toml +++ b/yake_rust/Cargo.toml @@ -58,6 +58,7 @@ contractions = "0.5.4" segtok = "0.1.2" levenshtein = "1.0.5" indexmap = "2.7.0" +serde = { version = "1.0.217", optional = true } [dev-dependencies] divan = "0.1.17" diff --git a/yake_rust/src/context.rs b/yake_rust/src/context.rs new file mode 100644 index 0000000..0f79e46 --- /dev/null +++ b/yake_rust/src/context.rs @@ -0,0 +1,44 @@ +use std::collections::HashMap; + +use crate::counter::Counter; +use crate::UTerm; + +/// Stats for a single term `T` against another terms. +#[derive(Default)] +pub struct PairwiseFreq<'s> { + /// How often `T` stands after: `A..T` + follows: Counter<&'s UTerm>, + /// How often `T` stands before: `T..A` + followed_by: Counter<&'s UTerm>, +} + +#[derive(Default)] +pub struct Contexts<'s> { + map: HashMap<&'s UTerm, PairwiseFreq<'s>>, +} + +impl<'s> Contexts<'s> { + pub fn track(&mut self, left: &'s UTerm, right: &'s UTerm) { + self.map.entry(right).or_default().follows.inc(left); + self.map.entry(left).or_default().followed_by.inc(right); + } + + /// The total number of cases where `term` stands on the left side of `by`: `term .. 
by` + pub fn cases_term_is_followed(&self, term: &'s UTerm, by: &'s UTerm) -> usize { + self.map.get(&term).unwrap().followed_by.get(&by) + } + + /// Value showing how divergent the surrounding of a term is. + /// The term may appear many times with the same words around, which means it's a fixed expression. + /// + /// `0` is fixed, `1` is divergent. + pub fn diversity_of(&self, term: &'s UTerm) -> (f64, f64) { + match self.map.get(&term) { + None => (0., 0.), + Some(PairwiseFreq { follows: leftward, followed_by: rightward }) => ( + if leftward.is_empty() { 0. } else { leftward.distinct() as f64 / leftward.total() as f64 }, + if rightward.is_empty() { 0. } else { rightward.distinct() as f64 / rightward.total() as f64 }, + ), + } + } +} diff --git a/yake_rust/src/lib.rs b/yake_rust/src/lib.rs index df60a26..2a81a0c 100644 --- a/yake_rust/src/lib.rs +++ b/yake_rust/src/lib.rs @@ -7,12 +7,15 @@ use std::iter::FromIterator; use indexmap::{IndexMap, IndexSet}; use plural_helper::PluralHelper; use preprocessor::{split_into_sentences, split_into_words}; +#[cfg(feature = "serde")] +use serde; use stats::{mean, median, stddev}; -use crate::counter::Counter; +use crate::context::Contexts; use crate::levenshtein::levenshtein_ratio; pub use crate::stopwords::StopWords; +mod context; mod counter; mod levenshtein; mod plural_helper; @@ -32,7 +35,6 @@ type Sentences = Vec; type Candidates<'s> = IndexMap<&'s [LTerm], Candidate<'s>>; type Features<'s> = HashMap<&'s LTerm, TermStats>; type Words<'s> = HashMap<&'s UTerm, Vec>>; -type Contexts<'s> = HashMap<&'s UTerm, (Counter<&'s UTerm>, Counter<&'s UTerm>)>; #[derive(Debug, Copy, Clone, Eq, PartialEq)] enum Tag { @@ -92,10 +94,6 @@ struct TermStats { position: f64, /// Normalized term frequency heuristic frequency: f64, - /// Left dispersion - dl: f64, - /// Right dispersion - dr: f64, /// Term relatedness to context relatedness: f64, /// Term's different sentences heuristic @@ -105,6 +103,7 @@ struct TermStats { } 
#[derive(PartialEq, Clone, Debug)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct ResultItem { pub raw: String, pub keyword: LTerm, @@ -135,6 +134,7 @@ struct Candidate<'s> { } #[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Config { /// The number of n-grams. /// @@ -153,6 +153,11 @@ pub struct Config { /// /// The [original implementation](https://github.com/LIAAD/) sticks with `true`. pub strict_capital: bool, + + /// When `true`, key phrases are allowed to have only alphanumeric characters and hyphen. + pub only_alphanumeric_and_hyphen: bool, + /// Key phrases can't be too short, less than `minimum_chars` in total. + pub minimum_chars: usize, } impl Default for Config { @@ -164,11 +169,14 @@ impl Default for Config { ngrams: 3, remove_duplicates: true, strict_capital: true, + only_alphanumeric_and_hyphen: false, + minimum_chars: 3, } } } #[derive(Debug, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Yake { config: Config, stop_words: StopWords, @@ -187,7 +195,6 @@ impl Yake { let features = self.extract_features(&context, vocabulary, &sentences); let mut ngrams: Candidates = self.ngram_selection(self.config.ngrams, &sentences); - self.filter_candidates(&mut ngrams, 3, false); Yake::candidate_weighting(features, &context, &mut ngrams); let mut results = ngrams @@ -282,7 +289,7 @@ impl Yake { /// a given term and its predecessor AND a given term and its subsequent term, /// found within a window of a given size. 
fn build_context<'s>(&self, sentences: &'s [Sentence]) -> Contexts<'s> { - let mut contexts = Contexts::default(); + let mut ctx = Contexts::default(); for sentence in sentences { let mut window: VecDeque<(&String, &UTerm)> = VecDeque::with_capacity(self.config.window_size + 1); @@ -302,8 +309,7 @@ impl Yake { continue; } - contexts.entry(term).or_default().0.inc(left_uterm); // term: [.., ->left] - contexts.entry(left_uterm).or_default().1.inc(term); // left: [.., ->term] + ctx.track(left_uterm, term); } } @@ -314,7 +320,7 @@ impl Yake { } } - contexts + ctx } fn is_d_tagged(&self, word: &str) -> bool { @@ -355,7 +361,7 @@ impl Yake { /// Computes local statistic features that extract informative content within the text /// to calculate the importance of single terms. - fn extract_features<'s>(&self, contexts: &Contexts, words: Words<'s>, sentences: &'s Sentences) -> Features<'s> { + fn extract_features<'s>(&self, ctx: &Contexts, words: Words<'s>, sentences: &'s Sentences) -> Features<'s> { let tf = words.values().map(Vec::len); let words_nsw: HashMap<&UTerm, usize> = sentences @@ -450,14 +456,8 @@ impl Yake { } { - if let Some((leftward, rightward)) = contexts.get(&u_term) { - stats.dl = - if leftward.is_empty() { 0. } else { leftward.distinct() as f64 / leftward.total() as f64 }; - stats.dr = - if rightward.is_empty() { 0. 
} else { rightward.distinct() as f64 / rightward.total() as f64 }; - } - - stats.relatedness = 1.0 + (stats.dr + stats.dl) * (stats.tf / max_tf); + let (dl, dr) = ctx.diversity_of(u_term); + stats.relatedness = 1.0 + (dr + dl) * (stats.tf / max_tf); } { @@ -476,7 +476,7 @@ impl Yake { features } - fn candidate_weighting<'s>(features: Features<'s>, contexts: &Contexts<'s>, candidates: &mut Candidates<'s>) { + fn candidate_weighting<'s>(features: Features<'s>, ctx: &Contexts<'s>, candidates: &mut Candidates<'s>) { for candidate in candidates.values_mut() { let lc_terms = candidate.lc_terms; let uq_terms = candidate.uq_terms; @@ -491,19 +491,21 @@ impl Yake { let mut prob_succ = 0.0; if 0 < j { // Not the first term - // #previous term occuring before this one / #previous term + // #previous term occurring before this one / #previous term let prev_uq = uq_terms.get(j - 1).unwrap(); let prev_lc = lc_terms.get(j - 1).unwrap(); - let prev_into_stopword = contexts.get(&prev_uq).unwrap().1.get(&uq); - prob_prev = prev_into_stopword as f64 / features.get(&prev_lc).unwrap().tf; + prob_prev = + ctx.cases_term_is_followed(&prev_uq, &uq) as f64 / features.get(&prev_lc).unwrap().tf; } if j < uq_terms.len() { // Not the last term - // #next term occuring after this one / #next term + // #next term occurring after this one / #next term let next_uq = uq_terms.get(j + 1).unwrap(); let next_lc = lc_terms.get(j + 1).unwrap(); - let stopword_into_next = contexts.get(&uq).unwrap().1.get(&next_uq); - prob_succ = stopword_into_next as f64 / features.get(&next_lc).unwrap().tf; + prob_succ = + ctx.cases_term_is_followed(&uq, &next_uq) as f64 / features.get(&next_lc).unwrap().tf; + // fixme: Probability P(T[i+1] | T[i]) is weird. + // Why divide by Fr(T[i]) at first, but by Fr(T[i+1]) at second? 
} let prob = prob_prev * prob_succ; @@ -524,48 +526,52 @@ impl Yake { } } - fn filter_candidates( - &self, - candidates: &mut Candidates, - minimum_length: usize, - only_alphanumeric_and_hyphen: bool, // could be a function - ) { - // fixme: filter right before inserting into the set to optimize - candidates.retain(|_k, v| !{ - let lc_terms = v.lc_terms; - let lc_words: HashSet<<erm> = HashSet::from_iter(lc_terms); - - let has_float = || lc_words.iter().any(|&w| self.is_d_tagged(w)); - let has_stop_word = || self.is_stopword(&lc_terms[0]) || self.is_stopword(lc_terms.last().unwrap()); - let has_unparsable = || lc_words.iter().any(|&w| self.is_u_tagged(w)); - let not_enough_symbols = || lc_words.iter().map(|w| w.chars().count()).sum::() < minimum_length; - let has_non_alphanumeric = - || only_alphanumeric_and_hyphen && !lc_words.iter().all(word_is_alphanumeric_and_hyphen); - - // remove candidate if - has_float() || has_stop_word() || has_unparsable() || not_enough_symbols() || has_non_alphanumeric() - }); + fn is_candidate(&self, lc_terms: &[LTerm]) -> bool { + let lc_words: HashSet<<erm> = HashSet::from_iter(lc_terms); + + let has_float = || lc_words.iter().any(|&w| self.is_d_tagged(w)); + let has_stop_word = || self.is_stopword(&lc_terms[0]) || self.is_stopword(lc_terms.last().unwrap()); + let has_unparsable = || lc_words.iter().any(|&w| self.is_u_tagged(w)); + let not_enough_symbols = + || lc_terms.iter().map(|w| w.chars().count()).sum::() < self.config.minimum_chars; + let has_non_alphanumeric = + || self.config.only_alphanumeric_and_hyphen && !lc_words.iter().all(word_is_alphanumeric_and_hyphen); + + !{ has_float() || has_stop_word() || has_unparsable() || not_enough_symbols() || has_non_alphanumeric() } } fn ngram_selection<'s>(&self, n: usize, sentences: &'s Sentences) -> Candidates<'s> { let mut candidates = Candidates::new(); + let mut ignored = HashSet::new(); + for sentence in sentences.iter() { let length = sentence.words.len(); + for j in 0..length { 
for k in (j + 1..length + 1).take(n) { if (j..k).is_empty() { continue; } - let lc_words = &sentence.lc_terms[j..k]; - let candidate = candidates.entry(lc_words).or_default(); + let lc_terms = &sentence.lc_terms[j..k]; + + if ignored.contains(lc_terms) { + continue; + } + // todo: optimize: if some checks have failed, we may skip ngrams, by j += k + if !self.is_candidate(lc_terms) { + ignored.insert(lc_terms); + continue; + } + let candidate = candidates.entry(lc_terms).or_default(); + candidate.lc_terms = lc_terms; candidate.occurrences.push(&sentence.words[j..k]); - candidate.lc_terms = lc_words; candidate.uq_terms = &sentence.uq_terms[j..k]; } } } + candidates } @@ -583,784 +589,4 @@ fn word_is_alphanumeric_and_hyphen(word: impl AsRef) -> bool { } #[cfg(test)] -mod tests { - use pretty_assertions::assert_eq; - - use super::*; - - #[test] - fn short() { - let text = "this is a keyword"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(1)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("keyword", "keyword", 0.1583)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn order() { - // Verifies that order of keywords with the same score is preserved. - // If not, this test becomes unstable. 
- let text = "Machine learning"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(3)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("Machine", "machine", 0.1583), ("learning", "learning", 0.1583)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn laptop() { - let text = "Do you need an Apple laptop?"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(2)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("Apple", "apple", 0.1448), ("laptop", "laptop", 0.1583)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn headphones() { - let text = "Do you like headphones? \ - Starting this Saturday, we will be kicking off a huge sale of headphones! 
\ - If you need headphones, we've got you covered!"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(3)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = - [("headphones", "headphones", 0.1141), ("Saturday", "saturday", 0.2111), ("Starting", "starting", 0.4096)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn multi_ngram() { - let text = "I will give you a great deal if you just read this!"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(1)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("great deal", "great deal", 0.0257)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn singular() { - let text = "One smartwatch. One phone. Many phone."; // Weird grammar; to compare with the "plural" test - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(2)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.2474)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn plural() { - let text = "One smartwatch. One phone. 
Many phones."; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(3)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.4949), ("phones", "phones", 0.4949)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn non_hyphenated() { - let text = "Truly high tech!"; // For comparison with the "hyphenated" test - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(1)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("high tech", "high tech", 0.0494)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn hyphenated() { - let text = "Truly high-tech!"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(1)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("high-tech", "high-tech", 0.1583)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn weekly_newsletter_short() { - let text = "This is your weekly newsletter!"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(3)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("weekly newsletter", "weekly newsletter", 0.0494), - ("newsletter", 
"newsletter", 0.1583), - ("weekly", "weekly", 0.2974), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn weekly_newsletter_long() { - let text = "This is your weekly newsletter! \ - Hundreds of great deals - everything from men's fashion \ - to high-tech drones!"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(5)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("weekly newsletter", "weekly newsletter", 0.0780), - ("newsletter", "newsletter", 0.2005), - ("weekly", "weekly", 0.3607), - ("great deals", "great deals", 0.4456), - ("high-tech drones", "high-tech drones", 0.4456), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn weekly_newsletter_long_with_paragraphs() { - let text = "This is your weekly newsletter!\n\n \ - \tHundreds of great deals - everything from men's fashion \n\ - to high-tech drones!"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(5)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("weekly newsletter", "weekly newsletter", 0.0780), - ("newsletter", "newsletter", 0.2005), - ("weekly", "weekly", 0.3607), - ("great deals", "great deals", 0.4456), - ("high-tech drones", "high-tech drones", 0.4456), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn composite_recurring_words_and_bigger_window() { - let text = "Machine learning is a growing field. 
Few research fields grow as much as machine learning grows."; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = - Yake::new(stopwords, Config { ngrams: 2, window_size: 2, ..Default::default() }).get_n_best(text, Some(5)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Machine learning", "machine learning", 0.1346), - ("growing field", "growing field", 0.1672), - ("learning", "learning", 0.2265), - ("Machine", "machine", 0.2341), - ("growing", "growing", 0.2799), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn composite_recurring_words_near_numbers() { - let text = "I buy 100 yellow bananas every day. Every night I eat bananas - all but 5 bananas."; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(3)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = - [("yellow bananas", "yellow bananas", 0.0682), ("buy", "buy", 0.1428), ("yellow", "yellow", 0.1428)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn composite_recurring_words_near_spelled_out_numbers() { - // For comparison with "composite_recurring_words_near_numbers" to see if numbers cause - let text = "I buy a hundred yellow bananas every day. 
Every night I eat bananas - all but five bananas."; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(3)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("hundred yellow", "hundred yellow", 0.0446), - ("yellow bananas", "yellow bananas", 0.1017), - ("day", "day", 0.1428), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn with_stopword_in_the_middle() { - let text = "Game of Thrones"; - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = - Yake::new(stopwords, Config { remove_duplicates: false, ..Config::default() }).get_n_best(text, Some(1)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [("Game of Thrones", "game of thrones", 0.01380)]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn google_sample_single_ngram() { - let text = include_str!("test_google.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Google", "google", 0.0251), - ("Kaggle", "kaggle", 0.0273), - ("data", "data", 0.08), - ("science", "science", 0.0983), - ("platform", "platform", 0.124), - ("service", "service", 0.1316), - ("acquiring", "acquiring", 0.1511), - ("learning", "learning", 0.1621), - ("Goldbloom", "goldbloom", 0.1625), - ("machine", "machine", 0.1672), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn 
google_sample_defaults() { - let text = include_str!("test_google.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Google", "google", 0.0251), - ("Kaggle", "kaggle", 0.0273), - ("CEO Anthony Goldbloom", "ceo anthony goldbloom", 0.0483), - ("data science", "data science", 0.055), - ("acquiring data science", "acquiring data science", 0.0603), - ("Google Cloud Platform", "google cloud platform", 0.0746), - ("data", "data", 0.08), - ("San Francisco", "san francisco", 0.0914), - ("Anthony Goldbloom declined", "anthony goldbloom declined", 0.0974), - ("science", "science", 0.0983), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn gitter_sample_defaults() { - let text = include_str!("test_gitter.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Gitter", "gitter", 0.0190), - ("GitLab", "gitlab", 0.0478), - ("acquires software chat", "acquires software chat", 0.0479), - ("chat startup Gitter", "chat startup gitter", 0.0512), - ("software chat startup", "software chat startup", 0.0612), - ("Gitter chat", "gitter chat", 0.0684), - ("GitLab acquires software", "gitlab acquires software", 0.0685), - ("startup", "startup", 0.0783), - ("software", "software", 0.0879), - ("code", "code", 0.0879), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn genius_sample_defaults() { - let text = include_str!("test_genius.txt"); // LIAAD/yake 
sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Genius", "genius", 0.0261), - ("company", "company", 0.0263), - ("Genius quietly laid", "genius quietly laid", 0.027), - ("company quietly laid", "company quietly laid", 0.0392), - ("media company", "media company", 0.0404), - ("Lehman", "lehman", 0.0412), - ("quietly laid", "quietly laid", 0.0583), - ("Tom Lehman told", "tom lehman told", 0.0603), - ("video", "video", 0.0650), - ("co-founder Tom Lehman", "co-founder tom lehman", 0.0669), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn german_sample_defaults() { - let text = include_str!("test_german.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("de").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Vereinigten Staaten", "vereinigten staaten", 0.0152), // LIAAD REFERENCE: 0.151 - ("Präsidenten Donald Trump", "präsidenten donald trump", 0.0182), - ("Donald Trump", "donald trump", 0.0211), // LIAAD REFERENCE: 0.21 - ("trifft Donald Trump", "trifft donald trump", 0.0231), // LIAAD REFERENCE: 0.23 - ("Trump", "trump", 0.0240), - ("Trumps Finanzminister Steven", "trumps finanzminister steven", 0.0243), - ("Kanzlerin Angela Merkel", "kanzlerin angela merkel", 0.0275), // LIAAD REFERENCE: 0.273 - ("deutsche Kanzlerin Angela", "deutsche kanzlerin angela", 0.0316), // LIAAD REFERENCE: 0.314 - ("Merkel trifft Donald", "merkel trifft donald", 0.0353), // LIAAD REFERENCE: 0.351 - ("Exportnation Deutschland", "exportnation deutschland", 0.038), // LIAAD REFERENCE: 0.0379 - 
]; - - // REASONS FOR DISCREPANCY: - // - The text contains both "bereit" ("ready") and "bereits" ("already"). - // While "bereits" is a stopword, "bereit" is not. - // LIAAD/yake keeps track of whether a term is a stopword or not - // in a key-value mapping, where the key is the term, lowercase, plural-normalized. - // (Note that the plural normalization techique used is rarely effective in German.) - // Since "bereits" occurs before "bereit" in the text, LIAAD/yake sees it, - // recognizes it is a stopword, and stores it under the key "bereit". Later, - // when it encounters "bereit" (NOT a stopword), it already has that key in its - // mapping so it looks it up and finds that it is a keyword (which it is not). - // Meanwhile, yake-rust does not have such a key-value store, so it correctly - // recognizes "bereits" as a stopword and "bereit" as a non-stopword. The extra - // inclusion of "bereit" in the non-stopwords affects the TF statistics and thus - // the frequency contribution to the weights, leading to slightly different scores. 
- - assert_eq!(actual, expected); - } - - #[test] - fn dutch_sample_defaults() { - let text = include_str!("test_nl.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("nl").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Vincent van Gogh", "vincent van gogh", 0.0111), - ("Gogh Museum", "gogh museum", 0.0125), - ("Gogh", "gogh", 0.0150), - ("Museum", "museum", 0.0438), - ("brieven", "brieven", 0.0635), - ("Vincent", "vincent", 0.0643), - ("Goghs schilderijen", "goghs schilderijen", 0.1009), - ("Gogh verging", "gogh verging", 0.1215), - ("Goghs", "goghs", 0.1651), - ("schrijven", "schrijven", 0.1704), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn finnish_sample_defaults() { - let text = include_str!("test_fi.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("fi").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Mobile Networks", "mobile networks", 0.0043), - ("Nokia tekee muutoksia", "nokia tekee muutoksia", 0.0061), - ("tekee muutoksia organisaatioonsa", "tekee muutoksia organisaatioonsa", 0.0065), - ("johtokuntaansa vauhdittaakseen yhtiön", "johtokuntaansa vauhdittaakseen yhtiön", 0.0088), - ("vauhdittaakseen yhtiön strategian", "vauhdittaakseen yhtiön strategian", 0.0088), - ("yhtiön strategian toteuttamista", "yhtiön strategian toteuttamista", 0.0092), - ("Networks", "networks", 0.0102), - ("Networks and Applications", "networks and applications", 0.0113), - ("strategian toteuttamista Nokia", "strategian toteuttamista nokia", 0.0127), - ("siirtyy Mobile Networks", "siirtyy mobile networks", 
0.0130), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn italian_sample_defaults() { - let text = include_str!("test_it.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("it").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(5)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Champions League", "champions league", 0.0390), - ("Quarti", "quarti", 0.0520), - ("Atlético Madrid", "atlético madrid", 0.0592), - ("Ottavi di finale", "ottavi di finale", 0.0646), - ("Real Madrid", "real madrid", 0.0701), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn french_sample_defaults() { - let text = include_str!("test_fr.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("fr").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("dégrade en France", "dégrade en france", 0.0254), - ("jusque-là uniquement associée", "jusque-là uniquement associée", 0.0504), - ("sondage Ifop réalisé", "sondage ifop réalisé", 0.0554), - ("religion se dégrade", "religion se dégrade", 0.091), - ("France", "france", 0.0941), - ("l'extrême droite", "l'extrême droite", 0.0997), - ("sondage Ifop", "sondage ifop", 0.101), - ("Islam", "islam", 0.1021), - ("musulmane en France", "musulmane en france", 0.1078), - ("Allemagne", "allemagne", 0.1086), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - #[ignore = "Crashes due to failed unwrap"] - fn portuguese_sport_sample_defaults() { - let text = include_str!("test_pt_1.txt"); // LIAAD/yake sample text - let stopwords = 
StopWords::predefined("pt").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("seleção brasileira treinará", "seleção brasileira treinará", 0.0072), - ("seleção brasileira", "seleção brasileira", 0.0100), - ("Seleção Brasileira visando", "seleção brasileira visando", 0.0192), - ("Seleção Brasileira encara", "seleção brasileira encara", 0.0344), - ("brasileira treinará", "brasileira treinará", 0.0373), - ("Renato Augusto", "renato augusto", 0.0376), - ("Copa da Rússia", "copa da rússia", 0.0407), - ("seleção", "seleção", 0.0454), - ("brasileira", "brasileira", 0.0528), - ]; - - assert_eq!(actual, expected); - } - - #[test] - fn portuguese_tourism_sample_defaults() { - let text = include_str!("test_pt_2.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("pt").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Alvor", "alvor", 0.0165), - ("Rio Alvor", "rio alvor", 0.0336), - ("Ria de Alvor", "ria de alvor", 0.0488), - ("encantadora vila", "encantadora vila", 0.0575), - ("Algarve", "algarve", 0.0774), - ("impressionantes de Portugal", "impressionantes de portugal", 0.0844), - ("estuário do Rio", "estuário do rio", 0.0907), - ("vila", "vila", 0.1017), - ("Ria", "ria", 0.1053), - ("Oceano Atlântico", "oceano atlântico", 0.1357), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn spanish_sample_defaults() { - let text = include_str!("test_es.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("es").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 
digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Guerra Civil Española", "guerra civil española", 0.0032), - ("Guerra Civil", "guerra civil", 0.0130), - ("Civil Española", "civil española", 0.0153), - ("Partido Socialista Obrero", "partido socialista obrero", 0.0283), - ("empezó la Guerra", "empezó la guerra", 0.0333), - ("Socialista Obrero Español", "socialista obrero español", 0.0411), - ("José Castillo", "josé castillo", 0.0426), - ("Española", "española", 0.0566), - ("José Antonio Primo", "josé antonio primo", 0.0589), - ("José Calvo Sotelo", "josé calvo sotelo", 0.0596), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn polish_sample_defaults() { - let text = include_str!("test_pl.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("pl").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("franka", "franka", 0.0328), - ("Geerta Wildersa VVD", "geerta wildersa vvd", 0.0346), - ("Geerta Wildersa", "geerta wildersa", 0.0399), - ("kurs franka", "kurs franka", 0.0486), - ("partii Geerta Wildersa", "partii geerta wildersa", 0.0675), - ("proc", "proc", 0.0692), - ("mld", "mld", 0.0724), - ("Narodowego Banku Szwajcarii", "narodowego banku szwajcarii", 0.0728), - ("kurs franka poniżej", "kurs franka poniżej", 0.0758), - ("Wildersa", "wildersa", 0.0765), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn turkish_sample_defaults() { - let text = include_str!("test_tr.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("tr").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - 
actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("OECD", "oecd", 0.0176), // LIAAD REFERENCE: 0.0178 - ("Tek Bakışta Eğitim", "tek bakışta eğitim", 0.0232), // LIAAD REFERENCE: 0.0236 - ("eğitim", "eğitim", 0.0274), // LIAAD REFERENCE: 0.0278 - ("OECD eğitim endeksi", "oecd eğitim endeksi", 0.0313), // LIAAD REFERENCE: 0.0323 - ("OECD ortalamasının", "oecd ortalamasının", 0.0375), // LIAAD REFERNENCE: 0.0383 - ("Kalkınma Örgütü'nün", "kalkınma örgütü'nün", 0.0449), // LIAAD REFERENCE: 0.045 - ("Tek Bakışta", "tek bakışta", 0.0449), // LIAAD REFERENCE: 0.045 - ("İşbirliği ve Kalkınma", "i̇şbirliği ve kalkınma", 0.0468), - ("Türkiye'de", "türkiye'de", 0.0476), // LIAAD REFERENCE: 0.0480 - ("yüksek", "yüksek", 0.0509), // LIAAD REFERENCE: 0.0513 - ]; - - // REASONS FOR DISCREPANCY: - // - Difference in tokenization. - - assert_eq!(actual, expected); - } - - #[test] - fn arabic_sample_defaults() { - let text = include_str!("test_ar.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("ar").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("عبد السلام العجيلي", "عبد السلام العجيلي", 0.0105), - ("اللغة العربية الأربعاء", "اللغة العربية الأربعاء", 0.0139), - ("عبد النبي اصطيف", "عبد النبي اصطيف", 0.0142), - ("العجيلي في مرآة", "العجيلي في مرآة", 0.0177), - ("مرآة النقد المقارن", "مرآة النقد المقارن", 0.0183), // LIAAD REFERENCE: 0.018 - ("السلام العجيلي", "السلام العجيلي", 0.0198), - ("اللغة العربية", "اللغة العربية", 0.0207), - ("مرآة النقد", "مرآة النقد", 0.0255), // LIAAD REFERENCE: 0.025 - ("اللغة العربية بدمشق", "اللغة العربية بدمشق", 0.0261), - ("مجمع اللغة العربية", "مجمع اللغة العربية", 0.0281), - ]; - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_1_defaults() { - let text = 
include_str!("test_data_1.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("pt").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Médio Oriente continua", "médio oriente continua", 0.0008), - ("Médio Oriente", "médio oriente", 0.0045), - ("Oriente continua", "oriente continua", 0.0117), - ("registar-se violentos confrontos", "registar-se violentos confrontos", 0.0178), - ("Faixa de Gaza", "faixa de gaza", 0.0268), - ("fogo hoje voltaram", "fogo hoje voltaram", 0.0311), - ("voltaram a registar-se", "voltaram a registar-se", 0.0311), - ("registar-se violentos", "registar-se violentos", 0.0311), - ("Exército israelita", "exército israelita", 0.0368), - ("Exército israelita voltou", "exército israelita voltou", 0.0639), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_2_defaults() { - let text = include_str!("test_data_2.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(5)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("highly radioactive water", "highly radioactive water", 0.0006), - ("crippled nuclear plant", "crippled nuclear plant", 0.0006), - ("ocean Japan official", "ocean japan official", 0.0031), - ("Japan official", "japan official", 0.0046), - ("official says highly", "official says highly", 0.0050), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_3_defaults() { - let text = include_str!("test_data_3.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let 
mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(5)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Global Crossing", "global crossing", 0.0034), - ("Hutchison Telecommunications", "hutchison telecommunications", 0.0053), - ("Telecommunications and Singapore", "telecommunications and singapore", 0.0072), - ("Singapore Technologies", "singapore technologies", 0.0072), - ("Technologies take control", "technologies take control", 0.0157), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_4_defaults() { - let text = include_str!("test_data_4.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("annual revenues increasing", "annual revenues increasing", 0.0018), - ("retail inventory management", "retail inventory management", 0.0023), - ("Dollar General", "dollar general", 0.0034), - ("inventory management", "inventory management", 0.0112), - ("perpetual progress", "perpetual progress", 0.0133), - ("revenues increasing", "revenues increasing", 0.0133), - ("fast track", "fast track", 0.0133), - ("road to perpetual", "road to perpetual", 0.0159), - ("annual revenues", "annual revenues", 0.0168), - ("stores opened", "stores opened", 0.0168), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_5_defaults() { - let text = include_str!("test_data_5.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - 
actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Handoff Trigger Table", "handoff trigger table", 0.0006), // LIAAD REFERENCE: 0.0007 - ("Handoff", "handoff", 0.0010), - ("WLAN Networks ABSTRACT", "wlan networks abstract", 0.0019), - ("Vertical handoff", "vertical handoff", 0.0020), - ("Handoff Trigger", "handoff trigger", 0.0021), - ("proactive handoff scheme", "proactive handoff scheme", 0.0021), - ("HTT Method Figure", "htt method figure", 0.0022), - ("WLAN", "wlan", 0.0023), - ("ABSTRACT Vertical handoff", "abstract vertical handoff", 0.0030), - ("traditional handoff scheme", "traditional handoff scheme", 0.0033), - ]; - - // REASONS FOR DISCREPANCY: - // - Difference in sentence splitting. - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_6_defaults() { - let text = include_str!("test_data_6.txt"); // LIAAD/yake sample text - let stopwords = StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("MRSA", "mrsa", 0.0047), - ("TSN Database", "tsn database", 0.0107), - ("methicillin-resistant Staphylococcus aureus", "methicillin-resistant staphylococcus aureus", 0.0116), - ("rates of MRSA", "rates of mrsa", 0.0145), - ("Staphylococcus aureus", "staphylococcus aureus", 0.0167), - ("methicillin-resistant Staphylococcus", "methicillin-resistant staphylococcus", 0.0177), - ("prevalence of MRSA", "prevalence of mrsa", 0.0201), - ("MRSA infections", "mrsa infections", 0.0218), - ("MRSA infections detected", "mrsa infections detected", 0.0223), - ("TSN", "tsn", 0.0250), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } - - #[test] - fn dataset_text_7_defaults() { - let text = include_str!("test_data_7.txt"); // LIAAD/yake sample text - let stopwords = 
StopWords::predefined("en").unwrap(); - let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10)); - // leave only 4 digits - actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); - let expected = [ - ("Environment Design Level", "environment design level", 0.0008), - ("Jerusalem Jerusalem", "jerusalem jerusalem", 0.0009), - ("Dynamics Based Control", "dynamics based control", 0.0011), - ("system dynamics", "system dynamics", 0.0017), - ("DBC", "dbc", 0.0019), - ("target system dynamics", "target system dynamics", 0.0019), - ("target dynamics", "target dynamics", 0.0023), - ("Science Bar Ilan", "science bar ilan", 0.0025), - ("EMT", "emt", 0.0026), - ("Dynamics", "dynamics", 0.0026), - ]; - // Results agree with reference implementation LIAAD/yake - - assert_eq!(actual, expected); - } -} +mod tests; diff --git a/yake_rust/src/stopwords/mod.rs b/yake_rust/src/stopwords/mod.rs index c7c0337..699c68d 100644 --- a/yake_rust/src/stopwords/mod.rs +++ b/yake_rust/src/stopwords/mod.rs @@ -8,6 +8,7 @@ use crate::LTerm; /// The list is used to mark potentially meaningless tokens and generally based on the language /// given as input. Tokens with fewer than three characters are also considered a stopword. 
#[derive(Debug, Default, Clone)] +#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct StopWords { set: HashSet, } diff --git a/yake_rust/src/tests.rs b/yake_rust/src/tests.rs new file mode 100644 index 0000000..c8f6863 --- /dev/null +++ b/yake_rust/src/tests.rs @@ -0,0 +1,763 @@ +use pretty_assertions::assert_eq; + +use super::*; + +fn test(text: &str, lang: &str, cfg: Config, n_best: Option, expected: [(&str, &str, f64); T]) { + let stopwords = StopWords::predefined(lang).unwrap(); + let mut actual = Yake::new(stopwords, cfg).get_n_best(text, n_best); + // leave only 4 digits + actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.); + assert_eq!(actual, expected); +} + +#[test] +fn short() { + test("this is a keyword", "en", Config::default(), Some(1), [("keyword", "keyword", 0.1583)]); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn keywords_order_is_preserved() { + // If not, this test becomes unstable. + test( + "Machine learning", + "en", + Config { ngrams: 1, ..Default::default() }, + Some(3), + [("Machine", "machine", 0.1583), ("learning", "learning", 0.1583)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn laptop() { + test( + "Do you need an Apple laptop?", + "en", + Config { ngrams: 1, ..Default::default() }, + Some(2), + [("Apple", "apple", 0.1448), ("laptop", "laptop", 0.1583)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn headphones() { + test( + "Do you like headphones? \ + Starting this Saturday, we will be kicking off a huge sale of headphones! 
\ + If you need headphones, we've got you covered!", + "en", + Config { ngrams: 1, ..Default::default() }, + Some(3), + [("headphones", "headphones", 0.1141), ("Saturday", "saturday", 0.2111), ("Starting", "starting", 0.4096)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn multi_ngram() { + test( + "I will give you a great deal if you just read this!", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(1), + [("great deal", "great deal", 0.0257)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn singular() { + test( + // Weird grammar; to compare with the "plural" test + "One smartwatch. One phone. Many phone.", + "en", + Config { ngrams: 1, ..Default::default() }, + Some(2), + [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.2474)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn plural() { + test( + "One smartwatch. One phone. Many phones.", + "en", + Config { ngrams: 1, ..Default::default() }, + Some(3), + [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.4949), ("phones", "phones", 0.4949)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn non_hyphenated() { + // For comparison with the "hyphenated" test + test( + "Truly high tech!", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(1), + [("high tech", "high tech", 0.0494)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn hyphenated() { + test( + "Truly high-tech!", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(1), + [("high-tech", "high-tech", 0.1583)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn weekly_newsletter_short() { + test( + "This is your weekly newsletter!", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(3), + [ + ("weekly newsletter", "weekly newsletter", 0.0494), + ("newsletter", "newsletter", 0.1583), + ("weekly", 
"weekly", 0.2974), + ], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn weekly_newsletter_long() { + test( + "This is your weekly newsletter! \ + Hundreds of great deals - everything from men's fashion \ + to high-tech drones!", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(5), + [ + ("weekly newsletter", "weekly newsletter", 0.0780), + ("newsletter", "newsletter", 0.2005), + ("weekly", "weekly", 0.3607), + ("great deals", "great deals", 0.4456), + ("high-tech drones", "high-tech drones", 0.4456), + ], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn weekly_newsletter_long_with_paragraphs() { + test( + "This is your weekly newsletter!\n\n \ + \tHundreds of great deals - everything from men's fashion \n\ + to high-tech drones!", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(5), + [ + ("weekly newsletter", "weekly newsletter", 0.0780), + ("newsletter", "newsletter", 0.2005), + ("weekly", "weekly", 0.3607), + ("great deals", "great deals", 0.4456), + ("high-tech drones", "high-tech drones", 0.4456), + ], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn composite_recurring_words_and_bigger_window() { + test( + "Machine learning is a growing field. Few research fields grow as much as machine learning grows.", + "en", + Config { ngrams: 2, window_size: 2, ..Default::default() }, + Some(5), + [ + ("Machine learning", "machine learning", 0.1346), + ("growing field", "growing field", 0.1672), + ("learning", "learning", 0.2265), + ("Machine", "machine", 0.2341), + ("growing", "growing", 0.2799), + ], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn composite_recurring_words_near_numbers() { + test( + "I buy 100 yellow bananas every day. 
Every night I eat bananas - all but 5 bananas.", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(3), + [("yellow bananas", "yellow bananas", 0.0682), ("buy", "buy", 0.1428), ("yellow", "yellow", 0.1428)], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn composite_recurring_words_near_spelled_out_numbers() { + // For comparison with "composite_recurring_words_near_numbers" to see if numbers cause different results + test( + "I buy a hundred yellow bananas every day. Every night I eat bananas - all but five bananas.", + "en", + Config { ngrams: 2, ..Default::default() }, + Some(3), + [ + ("hundred yellow", "hundred yellow", 0.0446), + ("yellow bananas", "yellow bananas", 0.1017), + ("day", "day", 0.1428), + ], + ); + // Results agree with reference implementation LIAAD/yake +} + +#[test] +fn with_stopword_in_the_middle() { + test( + "Game of Thrones", + "en", + Config { remove_duplicates: false, ..Config::default() }, + Some(1), + [("Game of Thrones", "game of thrones", 0.01380)], + ); + // Results agree with reference implementation LIAAD/yake +} + +mod liaad_yake_samples { + use super::*; + + #[test] + fn google_sample_single_ngram() { + // LIAAD/yake sample text + test( + include_str!("test_google.txt"), + "en", + Config { ngrams: 1, ..Default::default() }, + Some(10), + [ + ("Google", "google", 0.0251), + ("Kaggle", "kaggle", 0.0273), + ("data", "data", 0.08), + ("science", "science", 0.0983), + ("platform", "platform", 0.124), + ("service", "service", 0.1316), + ("acquiring", "acquiring", 0.1511), + ("learning", "learning", 0.1621), + ("Goldbloom", "goldbloom", 0.1625), + ("machine", "machine", 0.1672), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn google_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_google.txt"), + "en", + Config::default(), + Some(10), + [ + ("Google", "google", 0.0251), + ("Kaggle", "kaggle", 0.0273), + ("CEO Anthony Goldbloom", "ceo 
anthony goldbloom", 0.0483), + ("data science", "data science", 0.055), + ("acquiring data science", "acquiring data science", 0.0603), + ("Google Cloud Platform", "google cloud platform", 0.0746), + ("data", "data", 0.08), + ("San Francisco", "san francisco", 0.0914), + ("Anthony Goldbloom declined", "anthony goldbloom declined", 0.0974), + ("science", "science", 0.0983), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn gitter_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_gitter.txt"), + "en", + Config::default(), + Some(10), + [ + ("Gitter", "gitter", 0.0190), + ("GitLab", "gitlab", 0.0478), + ("acquires software chat", "acquires software chat", 0.0479), + ("chat startup Gitter", "chat startup gitter", 0.0512), + ("software chat startup", "software chat startup", 0.0612), + ("Gitter chat", "gitter chat", 0.0684), + ("GitLab acquires software", "gitlab acquires software", 0.0685), + ("startup", "startup", 0.0783), + ("software", "software", 0.0879), + ("code", "code", 0.0879), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn genius_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_genius.txt"), + "en", + Config::default(), + Some(10), + [ + ("Genius", "genius", 0.0261), + ("company", "company", 0.0263), + ("Genius quietly laid", "genius quietly laid", 0.027), + ("company quietly laid", "company quietly laid", 0.0392), + ("media company", "media company", 0.0404), + ("Lehman", "lehman", 0.0412), + ("quietly laid", "quietly laid", 0.0583), + ("Tom Lehman told", "tom lehman told", 0.0603), + ("video", "video", 0.0650), + ("co-founder Tom Lehman", "co-founder tom lehman", 0.0669), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn german_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_german.txt"), + "de", + Config::default(), + Some(10), + [ + ("Vereinigten 
Staaten", "vereinigten staaten", 0.0152), // LIAAD REFERENCE: 0.151 + ("Präsidenten Donald Trump", "präsidenten donald trump", 0.0182), + ("Donald Trump", "donald trump", 0.0211), // LIAAD REFERENCE: 0.21 + ("trifft Donald Trump", "trifft donald trump", 0.0231), // LIAAD REFERENCE: 0.23 + ("Trump", "trump", 0.0240), + ("Trumps Finanzminister Steven", "trumps finanzminister steven", 0.0243), + ("Kanzlerin Angela Merkel", "kanzlerin angela merkel", 0.0275), // LIAAD REFERENCE: 0.273 + ("deutsche Kanzlerin Angela", "deutsche kanzlerin angela", 0.0316), // LIAAD REFERENCE: 0.314 + ("Merkel trifft Donald", "merkel trifft donald", 0.0353), // LIAAD REFERENCE: 0.351 + ("Exportnation Deutschland", "exportnation deutschland", 0.038), // LIAAD REFERENCE: 0.0379 + ], + ); + // REASONS FOR DISCREPANCY: + // - The text contains both "bereit" ("ready") and "bereits" ("already"). + // While "bereits" is a stopword, "bereit" is not. + // LIAAD/yake keeps track of whether a term is a stopword or not + // in a key-value mapping, where the key is the term, lowercase, plural-normalized. + // (Note that the plural normalization technique used is rarely effective in German.) + // Since "bereits" occurs before "bereit" in the text, LIAAD/yake sees it, + // recognizes it is a stopword, and stores it under the key "bereit". Later, + // when it encounters "bereit" (NOT a stopword), it already has that key in its + // mapping so it looks it up and finds that it is a stopword (which it is not). + // Meanwhile, yake-rust does not have such a key-value store, so it correctly + // recognizes "bereits" as a stopword and "bereit" as a non-stopword. The extra + // inclusion of "bereit" in the non-stopwords affects the TF statistics and thus + // the frequency contribution to the weights, leading to slightly different scores. 
+ } + + #[test] + fn dutch_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_nl.txt"), + "nl", + Config::default(), + Some(10), + [ + ("Vincent van Gogh", "vincent van gogh", 0.0111), + ("Gogh Museum", "gogh museum", 0.0125), + ("Gogh", "gogh", 0.0150), + ("Museum", "museum", 0.0438), + ("brieven", "brieven", 0.0635), + ("Vincent", "vincent", 0.0643), + ("Goghs schilderijen", "goghs schilderijen", 0.1009), + ("Gogh verging", "gogh verging", 0.1215), + ("Goghs", "goghs", 0.1651), + ("schrijven", "schrijven", 0.1704), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn finnish_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_fi.txt"), + "fi", + Config::default(), + Some(10), + [ + ("Mobile Networks", "mobile networks", 0.0043), + ("Nokia tekee muutoksia", "nokia tekee muutoksia", 0.0061), + ("tekee muutoksia organisaatioonsa", "tekee muutoksia organisaatioonsa", 0.0065), + ("johtokuntaansa vauhdittaakseen yhtiön", "johtokuntaansa vauhdittaakseen yhtiön", 0.0088), + ("vauhdittaakseen yhtiön strategian", "vauhdittaakseen yhtiön strategian", 0.0088), + ("yhtiön strategian toteuttamista", "yhtiön strategian toteuttamista", 0.0092), + ("Networks", "networks", 0.0102), + ("Networks and Applications", "networks and applications", 0.0113), + ("strategian toteuttamista Nokia", "strategian toteuttamista nokia", 0.0127), + ("siirtyy Mobile Networks", "siirtyy mobile networks", 0.0130), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn italian_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_it.txt"), + "it", + Config::default(), + Some(5), + [ + ("Champions League", "champions league", 0.0390), + ("Quarti", "quarti", 0.0520), + ("Atlético Madrid", "atlético madrid", 0.0592), + ("Ottavi di finale", "ottavi di finale", 0.0646), + ("Real Madrid", "real madrid", 0.0701), + ], + ); + // Results agree with reference 
implementation LIAAD/yake + } + + #[test] + fn french_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_fr.txt"), + "fr", + Config::default(), + Some(10), + [ + ("dégrade en France", "dégrade en france", 0.0254), + ("jusque-là uniquement associée", "jusque-là uniquement associée", 0.0504), + ("sondage Ifop réalisé", "sondage ifop réalisé", 0.0554), + ("religion se dégrade", "religion se dégrade", 0.091), + ("France", "france", 0.0941), + ("l'extrême droite", "l'extrême droite", 0.0997), + ("sondage Ifop", "sondage ifop", 0.101), + ("Islam", "islam", 0.1021), + ("musulmane en France", "musulmane en france", 0.1078), + ("Allemagne", "allemagne", 0.1086), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + #[ignore = "Crashes due to failed unwrap"] + fn portuguese_sport_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_pt_1.txt"), + "pt", + Config::default(), + Some(10), + [ + ("seleção brasileira treinará", "seleção brasileira treinará", 0.0072), + ("seleção brasileira", "seleção brasileira", 0.0100), + ("Seleção Brasileira visando", "seleção brasileira visando", 0.0192), + ("Seleção Brasileira encara", "seleção brasileira encara", 0.0344), + ("brasileira treinará", "brasileira treinará", 0.0373), + ("Renato Augusto", "renato augusto", 0.0376), + ("Copa da Rússia", "copa da rússia", 0.0407), + ("seleção", "seleção", 0.0454), + ("brasileira", "brasileira", 0.0528), + ], + ); + } + + #[test] + fn portuguese_tourism_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_pt_2.txt"), + "pt", + Config::default(), + Some(10), + [ + ("Alvor", "alvor", 0.0165), + ("Rio Alvor", "rio alvor", 0.0336), + ("Ria de Alvor", "ria de alvor", 0.0488), + ("encantadora vila", "encantadora vila", 0.0575), + ("Algarve", "algarve", 0.0774), + ("impressionantes de Portugal", "impressionantes de portugal", 0.0844), + ("estuário do Rio", "estuário do rio", 0.0907), + ("vila", "vila", 
0.1017), + ("Ria", "ria", 0.1053), + ("Oceano Atlântico", "oceano atlântico", 0.1357), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn spanish_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_es.txt"), + "es", + Config::default(), + Some(10), + [ + ("Guerra Civil Española", "guerra civil española", 0.0032), + ("Guerra Civil", "guerra civil", 0.0130), + ("Civil Española", "civil española", 0.0153), + ("Partido Socialista Obrero", "partido socialista obrero", 0.0283), + ("empezó la Guerra", "empezó la guerra", 0.0333), + ("Socialista Obrero Español", "socialista obrero español", 0.0411), + ("José Castillo", "josé castillo", 0.0426), + ("Española", "española", 0.0566), + ("José Antonio Primo", "josé antonio primo", 0.0589), + ("José Calvo Sotelo", "josé calvo sotelo", 0.0596), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn polish_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_pl.txt"), + "pl", + Config::default(), + Some(10), + [ + ("franka", "franka", 0.0328), + ("Geerta Wildersa VVD", "geerta wildersa vvd", 0.0346), + ("Geerta Wildersa", "geerta wildersa", 0.0399), + ("kurs franka", "kurs franka", 0.0486), + ("partii Geerta Wildersa", "partii geerta wildersa", 0.0675), + ("proc", "proc", 0.0692), + ("mld", "mld", 0.0724), + ("Narodowego Banku Szwajcarii", "narodowego banku szwajcarii", 0.0728), + ("kurs franka poniżej", "kurs franka poniżej", 0.0758), + ("Wildersa", "wildersa", 0.0765), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn turkish_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_tr.txt"), + "tr", + Config::default(), + Some(10), + [ + ("OECD", "oecd", 0.0176), // LIAAD REFERENCE: 0.0178 + ("Tek Bakışta Eğitim", "tek bakışta eğitim", 0.0232), // LIAAD REFERENCE: 0.0236 + ("eğitim", "eğitim", 0.0274), // LIAAD REFERENCE: 0.0278 + ("OECD eğitim 
endeksi", "oecd eğitim endeksi", 0.0313), // LIAAD REFERENCE: 0.0323 + ("OECD ortalamasının", "oecd ortalamasının", 0.0375), // LIAAD REFERENCE: 0.0383 + ("Kalkınma Örgütü'nün", "kalkınma örgütü'nün", 0.0449), // LIAAD REFERENCE: 0.045 + ("Tek Bakışta", "tek bakışta", 0.0449), // LIAAD REFERENCE: 0.045 + ("İşbirliği ve Kalkınma", "i̇şbirliği ve kalkınma", 0.0468), + ("Türkiye'de", "türkiye'de", 0.0476), // LIAAD REFERENCE: 0.0480 + ("yüksek", "yüksek", 0.0509), // LIAAD REFERENCE: 0.0513 + ], + ); + // REASONS FOR DISCREPANCY: + // - Difference in tokenization. + } + + #[test] + fn arabic_sample_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_ar.txt"), + "ar", + Config::default(), + Some(10), + [ + ("عبد السلام العجيلي", "عبد السلام العجيلي", 0.0105), + ("اللغة العربية الأربعاء", "اللغة العربية الأربعاء", 0.0139), + ("عبد النبي اصطيف", "عبد النبي اصطيف", 0.0142), + ("العجيلي في مرآة", "العجيلي في مرآة", 0.0177), + ("مرآة النقد المقارن", "مرآة النقد المقارن", 0.0183), // LIAAD REFERENCE: 0.018 + ("السلام العجيلي", "السلام العجيلي", 0.0198), + ("اللغة العربية", "اللغة العربية", 0.0207), + ("مرآة النقد", "مرآة النقد", 0.0255), // LIAAD REFERENCE: 0.025 + ("اللغة العربية بدمشق", "اللغة العربية بدمشق", 0.0261), + ("مجمع اللغة العربية", "مجمع اللغة العربية", 0.0281), + ], + ); + } + + #[test] + fn dataset_text_1_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_1.txt"), + "pt", + Config::default(), + Some(10), + [ + ("Médio Oriente continua", "médio oriente continua", 0.0008), + ("Médio Oriente", "médio oriente", 0.0045), + ("Oriente continua", "oriente continua", 0.0117), + ("registar-se violentos confrontos", "registar-se violentos confrontos", 0.0178), + ("Faixa de Gaza", "faixa de gaza", 0.0268), + ("fogo hoje voltaram", "fogo hoje voltaram", 0.0311), + ("voltaram a registar-se", "voltaram a registar-se", 0.0311), + ("registar-se violentos", "registar-se violentos", 0.0311), + ("Exército israelita", "exército 
israelita", 0.0368), + ("Exército israelita voltou", "exército israelita voltou", 0.0639), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn dataset_text_2_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_2.txt"), + "en", + Config::default(), + Some(5), + [ + ("highly radioactive water", "highly radioactive water", 0.0006), + ("crippled nuclear plant", "crippled nuclear plant", 0.0006), + ("ocean Japan official", "ocean japan official", 0.0031), + ("Japan official", "japan official", 0.0046), + ("official says highly", "official says highly", 0.0050), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn dataset_text_3_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_3.txt"), + "en", + Config::default(), + Some(5), + [ + ("Global Crossing", "global crossing", 0.0034), + ("Hutchison Telecommunications", "hutchison telecommunications", 0.0053), + ("Telecommunications and Singapore", "telecommunications and singapore", 0.0072), + ("Singapore Technologies", "singapore technologies", 0.0072), + ("Technologies take control", "technologies take control", 0.0157), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn dataset_text_4_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_4.txt"), + "en", + Config::default(), + Some(10), + [ + ("annual revenues increasing", "annual revenues increasing", 0.0018), + ("retail inventory management", "retail inventory management", 0.0023), + ("Dollar General", "dollar general", 0.0034), + ("inventory management", "inventory management", 0.0112), + ("perpetual progress", "perpetual progress", 0.0133), + ("revenues increasing", "revenues increasing", 0.0133), + ("fast track", "fast track", 0.0133), + ("road to perpetual", "road to perpetual", 0.0159), + ("annual revenues", "annual revenues", 0.0168), + ("stores opened", "stores opened", 0.0168), + 
], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn dataset_text_5_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_5.txt"), + "en", + Config::default(), + Some(10), + [ + ("Handoff Trigger Table", "handoff trigger table", 0.0006), // LIAAD REFERENCE: 0.0007 + ("Handoff", "handoff", 0.0010), + ("WLAN Networks ABSTRACT", "wlan networks abstract", 0.0019), + ("Vertical handoff", "vertical handoff", 0.0020), + ("Handoff Trigger", "handoff trigger", 0.0021), + ("proactive handoff scheme", "proactive handoff scheme", 0.0021), + ("HTT Method Figure", "htt method figure", 0.0022), + ("WLAN", "wlan", 0.0023), + ("ABSTRACT Vertical handoff", "abstract vertical handoff", 0.0030), + ("traditional handoff scheme", "traditional handoff scheme", 0.0033), + ], + ); + // REASONS FOR DISCREPANCY: + // - Difference in sentence splitting. + } + + #[test] + fn dataset_text_6_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_6.txt"), + "en", + Config::default(), + Some(10), + [ + ("MRSA", "mrsa", 0.0047), + ("TSN Database", "tsn database", 0.0107), + ("methicillin-resistant Staphylococcus aureus", "methicillin-resistant staphylococcus aureus", 0.0116), + ("rates of MRSA", "rates of mrsa", 0.0145), + ("Staphylococcus aureus", "staphylococcus aureus", 0.0167), + ("methicillin-resistant Staphylococcus", "methicillin-resistant staphylococcus", 0.0177), + ("prevalence of MRSA", "prevalence of mrsa", 0.0201), + ("MRSA infections", "mrsa infections", 0.0218), + ("MRSA infections detected", "mrsa infections detected", 0.0223), + ("TSN", "tsn", 0.0250), + ], + ); + // Results agree with reference implementation LIAAD/yake + } + + #[test] + fn dataset_text_7_defaults() { + // LIAAD/yake sample text + test( + include_str!("test_data_7.txt"), + "en", + Config::default(), + Some(10), + [ + ("Environment Design Level", "environment design level", 0.0008), + ("Jerusalem Jerusalem", "jerusalem jerusalem", 0.0009), 
+ ("Dynamics Based Control", "dynamics based control", 0.0011), + ("system dynamics", "system dynamics", 0.0017), + ("DBC", "dbc", 0.0019), + ("target system dynamics", "target system dynamics", 0.0019), + ("target dynamics", "target dynamics", 0.0023), + ("Science Bar Ilan", "science bar ilan", 0.0025), + ("EMT", "emt", 0.0026), + ("Dynamics", "dynamics", 0.0026), + ], + ); + // Results agree with reference implementation LIAAD/yake + } +}