diff --git a/Cargo.lock b/Cargo.lock
index 74f6d47..a6480bd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -28,12 +28,56 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "anstream"
+version = "0.6.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b"
+dependencies = [
+ "anstyle",
+ "anstyle-parse",
+ "anstyle-query",
+ "anstyle-wincon",
+ "colorchoice",
+ "is_terminal_polyfill",
+ "utf8parse",
+]
+
 [[package]]
 name = "anstyle"
 version = "1.0.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
 
+[[package]]
+name = "anstyle-parse"
+version = "0.2.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
+dependencies = [
+ "utf8parse",
+]
+
+[[package]]
+name = "anstyle-query"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
+dependencies = [
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "anstyle-wincon"
+version = "3.0.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e"
+dependencies = [
+ "anstyle",
+ "once_cell",
+ "windows-sys 0.59.0",
+]
+
 [[package]]
 name = "arbitrary"
 version = "1.4.1"
@@ -150,30 +194,51 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.5.26"
+version = "4.5.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783"
+checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796"
 dependencies = [
  "clap_builder",
+ "clap_derive",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.5.26"
+version = "4.5.27"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121"
+checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7"
 dependencies = [
+ "anstream",
  "anstyle",
  "clap_lex",
+ "strsim",
  "terminal_size",
 ]
 
+[[package]]
+name = "clap_derive"
+version = "4.5.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54b755194d6389280185988721fffba69495eed5ee9feeee9a599b53db80318c"
+dependencies = [
+ "heck",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "clap_lex"
 version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
 
+[[package]]
+name = "colorchoice"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
+
 [[package]]
 name = "condtype"
 version = "1.3.0"
@@ -249,6 +314,27 @@ dependencies = [
  "typenum",
 ]
 
+[[package]]
+name = "csv"
+version = "1.3.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
+dependencies = [
+ "csv-core",
+ "itoa",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "csv-core"
+version = "0.1.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "deflate64"
 version = "0.1.9"
@@ -292,6 +378,27 @@ dependencies = [
  "subtle",
 ]
 
+[[package]]
+name = "dirs-next"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1"
+dependencies = [
+ "cfg-if",
+ "dirs-sys-next",
+]
+
+[[package]]
+name = "dirs-sys-next"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d"
+dependencies = [
+ "libc",
+ "redox_users",
+ "winapi",
+]
+
 [[package]]
 name = "displaydoc"
 version = "0.2.5"
@@ -334,6 +441,12 @@ version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
 
+[[package]]
+name = "encode_unicode"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
+
 [[package]]
 name = "envmnt"
 version = "0.8.4"
@@ -357,9 +470,15 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
 dependencies = [
  "libc",
- "windows-sys",
+ "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "exit-code"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf4cdb977193f2d7688525ca10d86199fe9bdd9db26b97ef490558128c643dc4"
+
 [[package]]
 name = "fancy-regex"
 version = "0.14.0"
@@ -429,6 +548,18 @@ version = "0.15.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
 
+[[package]]
+name = "heck"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
+
+[[package]]
+name = "hermit-abi"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc"
+
 [[package]]
 name = "hmac"
 version = "0.12.1"
@@ -462,9 +593,9 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.7.0"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
+checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
 dependencies = [
  "equivalent",
  "hashbrown 0.15.2",
@@ -479,6 +610,23 @@ dependencies = [
  "generic-array",
 ]
 
+[[package]]
+name = "is-terminal"
+version = "0.4.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys 0.52.0",
+]
+
+[[package]]
+name = "is_terminal_polyfill"
+version = "1.70.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
+
 [[package]]
 name = "itertools"
 version = "0.14.0"
@@ -503,6 +651,12 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
 [[package]]
 name = "levenshtein"
 version = "1.0.5"
@@ -515,6 +669,16 @@ version = "0.2.169"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
 
+[[package]]
+name = "libredox"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d"
+dependencies = [
+ "bitflags",
+ "libc",
+]
+
 [[package]]
 name = "linked-hash-map"
 version = "0.5.6"
@@ -647,6 +811,20 @@ dependencies = [
  "yansi",
 ]
 
+[[package]]
+name = "prettytable-rs"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eea25e07510aa6ab6547308ebe3c036016d162b8da920dbb079e3ba8acf3d95a"
+dependencies = [
+ "csv",
+ "encode_unicode",
+ "is-terminal",
+ "lazy_static",
+ "term",
+ "unicode-width",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.93"
@@ -695,6 +873,17 @@ dependencies = [
  "getrandom",
 ]
 
+[[package]]
+name = "redox_users"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43"
+dependencies = [
+ "getrandom",
+ "libredox",
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "regex"
 version = "1.11.1"
@@ -740,9 +929,15 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys",
- "windows-sys",
+ "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "rustversion"
+version = "1.0.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4"
+
 [[package]]
 name = "rusty-hook"
 version = "0.11.2"
@@ -794,9 +989,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.135"
+version = "1.0.137"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9"
+checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
 dependencies = [
  "itoa",
  "memchr",
@@ -846,6 +1041,12 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "strsim"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
+
 [[package]]
 name = "subtle"
 version = "2.6.1"
@@ -863,6 +1064,17 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "term"
+version = "0.7.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59df8ac95d96ff9bede18eb7300b0fda5e5d8d90960e76f8e14ae765eedbf1f"
+dependencies = [
+ "dirs-next",
+ "rustversion",
+ "winapi",
+]
+
 [[package]]
 name = "terminal_size"
 version = "0.4.1"
@@ -870,7 +1082,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9"
 dependencies = [
  "rustix",
- "windows-sys",
+ "windows-sys 0.59.0",
+]
+
+[[package]]
+name = "thiserror"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
+dependencies = [
+ "thiserror-impl 1.0.69",
 ]
 
 [[package]]
@@ -879,7 +1100,18 @@ version = "2.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
 dependencies = [
- "thiserror-impl",
+ "thiserror-impl 2.0.11",
+]
+
+[[package]]
+name = "thiserror-impl"
+version = "1.0.69"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
 ]
 
 [[package]]
@@ -939,6 +1171,12 @@ version = "0.1.14"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
 
+[[package]]
+name = "utf8parse"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
+
 [[package]]
 name = "version_check"
 version = "0.9.5"
@@ -951,6 +1189,37 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
+[[package]]
+name = "windows-sys"
+version = "0.52.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d"
+dependencies = [
+ "windows-targets",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.59.0"
@@ -1024,18 +1293,30 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
+[[package]]
+name = "yake"
+version = "0.1.0"
+dependencies = [
+ "clap",
+ "exit-code",
+ "prettytable-rs",
+ "serde_json",
+ "yake-rust",
+]
+
 [[package]]
 name = "yake-rust"
 version = "0.2.0"
 dependencies = [
  "contractions",
  "divan",
- "indexmap 2.7.0",
+ "indexmap 2.7.1",
  "levenshtein",
  "pretty_assertions",
  "regex",
  "rusty-hook",
  "segtok",
+ "serde",
  "streaming-stats",
  "zip",
 ]
@@ -1103,13 +1384,13 @@ dependencies = [
  "displaydoc",
  "flate2",
  "hmac",
- "indexmap 2.7.0",
+ "indexmap 2.7.1",
  "lzma-rs",
  "memchr",
  "pbkdf2",
  "rand",
  "sha1",
- "thiserror",
+ "thiserror 2.0.11",
  "time",
  "zeroize",
  "zopfli",
diff --git a/Cargo.toml b/Cargo.toml
index cd4b45c..ff5bfe9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,8 +1,6 @@
 [workspace]
 resolver = "2"
-members = [
-    "yake_rust"
-]
+members = ["yake", "yake_rust"]
 
 [workspace.dependencies]
 rusty-hook = "0.11.2"
diff --git a/README.md b/README.md
index 62846fa..23bb9b8 100644
--- a/README.md
+++ b/README.md
@@ -92,3 +92,33 @@ Results:
 | learning  | learning  | 0.1621 |
 | goldbloom | Goldbloom | 0.1625 |
 | machine   | machine   | 0.1672 |
+
+
+### CLI
+`yake` is the command-line interface (CLI) for `yake_rust`.
+
+#### Basic usage
+```shell
+$ cargo install --path yake
+$ yake --input-file yake_rust/src/test_google.txt
+```
+#### More options
+
+```shell
+$ yake --help
+
+Usage: yake [OPTIONS] <--text-input <TEXT>|--input-file <FILE>>
+
+Options:
+      --text-input <TEXT>      Input text, SURROUNDED by single quotes(')
+  -i, --input-file <FILE>      Input file
+  -n, --ngram-size <INTEGER>   Max size of the ngram [default: 3]
+      --dedup-lim <FLOAT>      Deduplication limiter [default: 0.9]
+      --window-size <INTEGER>  Window size [default: 1]
+  -t, --top <INTEGER>          Number of keyphrases to extract
+  -v, --verbose                Gets detailed information (such as the score)
+  -l, --language <TEXT>        Language [default: en]
+      --json                   Dump output as JSON
+  -h, --help                   Print help
+  -V, --version                Print version
+```
\ No newline at end of file
diff --git a/yake/Cargo.toml b/yake/Cargo.toml
new file mode 100644
index 0000000..77d72d0
--- /dev/null
+++ b/yake/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "yake"
+version = "0.1.0"
+edition = "2021"
+
+[dependencies]
+yake-rust = { path = "../yake_rust" , features = ["serde"]}
+clap = { version = "4.5.26", features = ["cargo", "derive", "string"] }
+serde_json = "1.0.135"
+exit-code = "1.0.0"
+prettytable-rs = "0.10.0"
diff --git a/yake/src/cli.rs b/yake/src/cli.rs
new file mode 100644
index 0000000..6e91d77
--- /dev/null
+++ b/yake/src/cli.rs
@@ -0,0 +1,154 @@
+use std::{path::PathBuf, sync::LazyLock};
+
+use clap::error::ErrorKind;
+use clap::{command, Args};
+use clap::{CommandFactory, Parser};
+use yake_rust::{Config, StopWords};
+
+/// Library defaults, built once on first use; they supply the default values of the options in `Cli`.
+static DEFAULT_CONFIG: LazyLock<Config> = LazyLock::new(Config::default);
+
+// Source of the text to analyse: the `group` attribute makes clap require exactly one of the two options.
+// NOTE: plain `//` comments are used on the clap-derived structs because clap turns `///` into help text.
+#[derive(Args)]
+#[group(required = true, multiple = false)]
+struct Input {
+    // -ti, --text_input TEXT
+    /// Input text
+    #[arg(
+        conflicts_with = "input_file",
+        long,
+        help = "Input text, SURROUNDED by single quotes(')",
+        value_name = "TEXT"
+    )]
+    text_input: Option<String>,
+
+    // -i, --input_file TEXT
+    /// Input file
+    #[arg(conflicts_with = "text_input", short, long, help = "Input file", value_name = "FILE")]
+    input_file: Option<PathBuf>,
+}
+
+// TODO
+// -df, --dedup-func [leve|jaro|seqm]
+// Deduplication function.
+
+// All command-line options of `yake`.
+// The `// -x, --long TYPE` note above each field presumably records the matching flag of the reference
+// YAKE command line — TODO confirm.
+#[derive(Parser)]
+#[command(version, about, long_about = None)]
+struct Cli {
+    #[command(flatten)]
+    input: Input,
+
+    // -n, --ngram-size INTEGER
+    /// Max size of the ngram
+    #[arg(short, long, default_value_t = DEFAULT_CONFIG.ngrams, help = "Max size of the ngram", value_name = "INTEGER")]
+    ngram_size: usize,
+
+    // -dl, --dedup-lim FLOAT
+    /// Deduplication limiter
+    #[arg(long, value_parser = parse_dedup, default_value_t = DEFAULT_CONFIG.deduplication_threshold, help = "Deduplication limiter", value_name = "FLOAT")]
+    dedup_lim: f64,
+
+    // -ws, --window-size INTEGER
+    /// Window size
+    #[arg(long, default_value_t = DEFAULT_CONFIG.window_size, help = "Window size", value_name = "INTEGER")]
+    window_size: usize,
+
+    // -t, --top INTEGER
+    /// Number of keyphrases to extract
+    #[arg(short, long, help = "Number of keyphrases to extract", value_name = "INTEGER")]
+    top: Option<usize>,
+
+    // -v, --verbose
+    /// Gets detailed information (such as the score)
+    #[arg(short, long, help = "Gets detailed information (such as the score)")]
+    verbose: bool,
+
+    // // --help
+    // /// Show this message and exit
+    // #[arg(short, long)]
+    // help: bool,
+
+    // -l, --language TEXT
+    /// Language
+    #[arg(short, long, default_value= "en", value_parser = parse_language, help = "Language", value_name = "TEXT")]
+    language: StopWords,
+
+    #[arg(long, help = "Dump output as JSON")]
+    json: bool,
+}
+
+/// `value_parser` for `--language`: looks up the predefined stop words for `cli_language`.
+///
+/// Returns an error message for clap to report when `StopWords::predefined` yields nothing.
+fn parse_language(cli_language: &str) -> Result<StopWords, String> {
+    StopWords::predefined(cli_language)
+        .ok_or_else(|| format!("Could not find language {}, did you enable this feature?", cli_language))
+}
+
+/// `value_parser` for `--dedup-lim`: accepts a float within `0..=1`.
+///
+/// On failure, returns either the float parser's own message or an out-of-range message.
+fn parse_dedup(cli_dedup_lim: &str) -> Result<f64, String> {
+    match cli_dedup_lim.parse::<f64>().map_err(|err| err.to_string())? {
+        value if (0.0..=1.0).contains(&value) => Ok(value),
+        value => Err(format!("{} is not in the range 0..=1", value)),
+    }
+}
+
+/// Everything `main` needs, resolved from the command line.
+pub struct ParsedCli {
+    /// Library defaults overridden by `--ngram-size`, `--window-size` and `--dedup-lim`.
+    pub config: Config,
+    /// Stop words selected by `--language`.
+    pub language: StopWords,
+    /// Text to analyse, from `--text-input` or the contents of `--input-file`.
+    pub input: String,
+    /// `--json`: dump the output as JSON.
+    pub json: bool,
+    /// `--top`: number of keyphrases to extract, if given.
+    pub top: Option<usize>,
+    /// `--verbose`: print detailed information (such as the score).
+    pub verbose: bool,
+}
+
+/// Parses the process arguments into a `ParsedCli`, reading `--input-file` if it was given.
+///
+/// Invalid arguments and an unreadable input file are both reported through clap, which prints the
+/// error and exits the process.
+///
+/// # Panics
+///
+/// Only if clap fails to enforce that exactly one input option is present.
+pub fn parse_cli() -> ParsedCli {
+    let cli = Cli::parse();
+
+    let input = match (cli.input.text_input, cli.input.input_file) {
+        (Some(text), None) => text,
+        (None, Some(path_to_file)) => std::fs::read_to_string(&path_to_file).unwrap_or_else(|err| {
+            // `{}` rather than `{:?}`: the message is shown to the user, not to a developer.
+            let message = format!("Error reading file `{}`: {}", path_to_file.display(), err);
+            Cli::command().error(ErrorKind::ValueValidation, message).exit()
+        }),
+        (None, None) | (Some(_), Some(_)) => {
+            unreachable!("clap should ensure that either text-input or input-file is specified")
+        }
+    };
+
+    ParsedCli {
+        config: Config {
+            ngrams: cli.ngram_size,
+            window_size: cli.window_size,
+            deduplication_threshold: cli.dedup_lim,
+            ..Config::default()
+        },
+        language: cli.language,
+        input,
+        json: cli.json,
+        verbose: cli.verbose,
+        top: cli.top,
+    }
+}
diff --git a/yake/src/main.rs b/yake/src/main.rs
new file mode 100644
index 0000000..0f1b75f
--- /dev/null
+++ b/yake/src/main.rs
@@ -0,0 +1,68 @@
+use cli::{parse_cli, ParsedCli};
+use prettytable::{format, row, Table};
+use yake_rust::{ResultItem, Yake};
+
+mod cli;
+
+/// Extracts the keywords of the input given on the command line and prints them to stdout.
+///
+/// With `--verbose`, the time spent extracting and printing is additionally reported on stderr.
+fn main() {
+    let ParsedCli { language, json, input, config, top, verbose } = parse_cli();
+
+    let now = std::time::Instant::now();
+
+    let keywords = Yake::new(language, config).get_n_best(&input, top);
+
+    output_keywords(&keywords, json, verbose);
+    if verbose {
+        eprintln!("Elapsed: {:.2?}", now.elapsed());
+    }
+}
+
+/// Prints `keywords` in the format selected by the flags.
+///
+/// `json` takes precedence over `verbose`; with neither flag only the keyword column is printed.
+fn output_keywords(keywords: &[ResultItem], json: bool, verbose: bool) {
+    match (json, verbose) {
+        (true, _) => output_keywords_json(keywords),
+        (false, true) => output_keywords_verbose(keywords),
+        (false, false) => output_keywords_simple(keywords),
+    }
+}
+
+/// Prints a `keyword | raw | score` table to stdout, with the score formatted to four decimal places.
+fn output_keywords_verbose(keywords: &[ResultItem]) {
+    let mut table = Table::new();
+    table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR);
+    table.set_titles(row!["keyword", "raw", "score"]);
+    for keyword in keywords {
+        table.add_row(row![keyword.keyword, keyword.raw, format!("{:.4}", keyword.score)]);
+    }
+    table.printstd()
+}
+
+/// Prints a single-column `keyword` table to stdout.
+fn output_keywords_simple(keywords: &[ResultItem]) {
+    let mut table = Table::new();
+    table.set_format(*format::consts::FORMAT_NO_BORDER_LINE_SEPARATOR);
+    table.set_titles(row!["keyword"]);
+    for keyword in keywords {
+        table.add_row(row![keyword.keyword]);
+    }
+    table.printstd()
+}
+
+/// Prints `keywords` as a single line of JSON on stdout.
+///
+/// If serialization fails, the error is reported on stderr and the process exits with
+/// `exit_code::SOFTWARE_ERROR`.
+fn output_keywords_json(keywords: &[ResultItem]) {
+    match serde_json::to_string(keywords) {
+        Ok(json) => println!("{}", json),
+        Err(e) => {
+            eprintln!("Unexpected error happened while trying to serialize result to json : {}", e);
+            std::process::exit(exit_code::SOFTWARE_ERROR)
+        }
+    }
+}
diff --git a/yake_rust/Cargo.toml b/yake_rust/Cargo.toml
index 75e7711..789b86a 100644
--- a/yake_rust/Cargo.toml
+++ b/yake_rust/Cargo.toml
@@ -58,6 +58,7 @@ contractions = "0.5.4"
 segtok = "0.1.2"
 levenshtein = "1.0.5"
 indexmap = "2.7.0"
+serde = { version = "1.0.217", optional = true, features = ["derive"] } # `derive` is needed by the `derive(serde::Serialize, serde::Deserialize)` attributes in lib.rs
 
 [dev-dependencies]
 divan = "0.1.17"
diff --git a/yake_rust/src/context.rs b/yake_rust/src/context.rs
new file mode 100644
index 0000000..0f79e46
--- /dev/null
+++ b/yake_rust/src/context.rs
@@ -0,0 +1,51 @@
+use std::collections::HashMap;
+
+use crate::counter::Counter;
+use crate::UTerm;
+
+/// Stats for a single term `T` against other terms.
+#[derive(Default)]
+pub struct PairwiseFreq<'s> {
+    /// How often `T` stands after: `A..T`
+    follows: Counter<&'s UTerm>,
+    /// How often `T` stands before: `T..A`
+    followed_by: Counter<&'s UTerm>,
+}
+
+/// Co-occurrence statistics of every tracked term, keyed by the term.
+#[derive(Default)]
+pub struct Contexts<'s> {
+    map: HashMap<&'s UTerm, PairwiseFreq<'s>>,
+}
+
+impl<'s> Contexts<'s> {
+    /// Records one occurrence of `left` standing before `right`, updating the stats of both terms.
+    pub fn track(&mut self, left: &'s UTerm, right: &'s UTerm) {
+        self.map.entry(right).or_default().follows.inc(left);
+        self.map.entry(left).or_default().followed_by.inc(right);
+    }
+
+    /// The total number of cases where `term` stands on the left side of `by`: `term .. by`
+    ///
+    /// Returns `0` for a `term` that has never been tracked.
+    pub fn cases_term_is_followed(&self, term: &'s UTerm, by: &'s UTerm) -> usize {
+        self.map.get(&term).map_or(0, |freq| freq.followed_by.get(&by))
+    }
+
+    /// Value showing how divergent the surrounding of a term is.
+    /// The term may appear many times with the same words around, which means it's a fixed expression.
+    ///
+    /// `0` is fixed, `1` is divergent.
+    ///
+    /// Returns `(left, right)`: for each side, the number of distinct neighbours over their total count,
+    /// or `0` when the term has no neighbour on that side (or was never tracked).
+    pub fn diversity_of(&self, term: &'s UTerm) -> (f64, f64) {
+        match self.map.get(&term) {
+            None => (0., 0.),
+            Some(PairwiseFreq { follows: leftward, followed_by: rightward }) => (
+                if leftward.is_empty() { 0. } else { leftward.distinct() as f64 / leftward.total() as f64 },
+                if rightward.is_empty() { 0. } else { rightward.distinct() as f64 / rightward.total() as f64 },
+            ),
+        }
+    }
+}
diff --git a/yake_rust/src/lib.rs b/yake_rust/src/lib.rs
index df60a26..2a81a0c 100644
--- a/yake_rust/src/lib.rs
+++ b/yake_rust/src/lib.rs
@@ -7,12 +7,15 @@ use std::iter::FromIterator;
 use indexmap::{IndexMap, IndexSet};
 use plural_helper::PluralHelper;
 use preprocessor::{split_into_sentences, split_into_words};
+#[cfg(feature = "serde")]
+use serde;
 use stats::{mean, median, stddev};
 
-use crate::counter::Counter;
+use crate::context::Contexts;
 use crate::levenshtein::levenshtein_ratio;
 pub use crate::stopwords::StopWords;
 
+mod context;
 mod counter;
 mod levenshtein;
 mod plural_helper;
@@ -32,7 +35,6 @@ type Sentences = Vec<Sentence>;
 type Candidates<'s> = IndexMap<&'s [LTerm], Candidate<'s>>;
 type Features<'s> = HashMap<&'s LTerm, TermStats>;
 type Words<'s> = HashMap<&'s UTerm, Vec<Occurrence<'s>>>;
-type Contexts<'s> = HashMap<&'s UTerm, (Counter<&'s UTerm>, Counter<&'s UTerm>)>;
 
 #[derive(Debug, Copy, Clone, Eq, PartialEq)]
 enum Tag {
@@ -92,10 +94,6 @@ struct TermStats {
     position: f64,
     /// Normalized term frequency heuristic
     frequency: f64,
-    /// Left dispersion
-    dl: f64,
-    /// Right dispersion
-    dr: f64,
     /// Term relatedness to context
     relatedness: f64,
     /// Term's different sentences heuristic
@@ -105,6 +103,7 @@ struct TermStats {
 }
 
 #[derive(PartialEq, Clone, Debug)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct ResultItem {
     pub raw: String,
     pub keyword: LTerm,
@@ -135,6 +134,7 @@ struct Candidate<'s> {
 }
 
 #[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct Config {
     /// The number of n-grams.
     ///
@@ -153,6 +153,11 @@ pub struct Config {
     ///
     /// The [original implementation](https://github.com/LIAAD/) sticks with `true`.
     pub strict_capital: bool,
+
+    /// When `true`, key phrases are allowed to have only alphanumeric characters and hyphen.
+    pub only_alphanumeric_and_hyphen: bool,
+    /// Key phrases can't be too short, less than `minimum_chars` in total.
+    pub minimum_chars: usize,
 }
 
 impl Default for Config {
@@ -164,11 +169,14 @@ impl Default for Config {
             ngrams: 3,
             remove_duplicates: true,
             strict_capital: true,
+            only_alphanumeric_and_hyphen: false,
+            minimum_chars: 3,
         }
     }
 }
 
 #[derive(Debug, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct Yake {
     config: Config,
     stop_words: StopWords,
@@ -187,7 +195,6 @@ impl Yake {
         let features = self.extract_features(&context, vocabulary, &sentences);
 
         let mut ngrams: Candidates = self.ngram_selection(self.config.ngrams, &sentences);
-        self.filter_candidates(&mut ngrams, 3, false);
         Yake::candidate_weighting(features, &context, &mut ngrams);
 
         let mut results = ngrams
@@ -282,7 +289,7 @@ impl Yake {
     /// a given term and its predecessor AND a given term and its subsequent term,
     /// found within a window of a given size.
     fn build_context<'s>(&self, sentences: &'s [Sentence]) -> Contexts<'s> {
-        let mut contexts = Contexts::default();
+        let mut ctx = Contexts::default();
 
         for sentence in sentences {
             let mut window: VecDeque<(&String, &UTerm)> = VecDeque::with_capacity(self.config.window_size + 1);
@@ -302,8 +309,7 @@ impl Yake {
                             continue;
                         }
 
-                        contexts.entry(term).or_default().0.inc(left_uterm); // term: [.., ->left]
-                        contexts.entry(left_uterm).or_default().1.inc(term); // left: [.., ->term]
+                        ctx.track(left_uterm, term);
                     }
                 }
 
@@ -314,7 +320,7 @@ impl Yake {
             }
         }
 
-        contexts
+        ctx
     }
 
     fn is_d_tagged(&self, word: &str) -> bool {
@@ -355,7 +361,7 @@ impl Yake {
 
     /// Computes local statistic features that extract informative content within the text
     /// to calculate the importance of single terms.
-    fn extract_features<'s>(&self, contexts: &Contexts, words: Words<'s>, sentences: &'s Sentences) -> Features<'s> {
+    fn extract_features<'s>(&self, ctx: &Contexts, words: Words<'s>, sentences: &'s Sentences) -> Features<'s> {
         let tf = words.values().map(Vec::len);
 
         let words_nsw: HashMap<&UTerm, usize> = sentences
@@ -450,14 +456,8 @@ impl Yake {
             }
 
             {
-                if let Some((leftward, rightward)) = contexts.get(&u_term) {
-                    stats.dl =
-                        if leftward.is_empty() { 0. } else { leftward.distinct() as f64 / leftward.total() as f64 };
-                    stats.dr =
-                        if rightward.is_empty() { 0. } else { rightward.distinct() as f64 / rightward.total() as f64 };
-                }
-
-                stats.relatedness = 1.0 + (stats.dr + stats.dl) * (stats.tf / max_tf);
+                let (dl, dr) = ctx.diversity_of(u_term);
+                stats.relatedness = 1.0 + (dr + dl) * (stats.tf / max_tf);
             }
 
             {
@@ -476,7 +476,7 @@ impl Yake {
         features
     }
 
-    fn candidate_weighting<'s>(features: Features<'s>, contexts: &Contexts<'s>, candidates: &mut Candidates<'s>) {
+    fn candidate_weighting<'s>(features: Features<'s>, ctx: &Contexts<'s>, candidates: &mut Candidates<'s>) {
         for candidate in candidates.values_mut() {
             let lc_terms = candidate.lc_terms;
             let uq_terms = candidate.uq_terms;
@@ -491,19 +491,21 @@ impl Yake {
                         let mut prob_succ = 0.0;
                         if 0 < j {
                             // Not the first term
-                            // #previous term occuring before this one / #previous term
+                            // #previous term occurring before this one / #previous term
                             let prev_uq = uq_terms.get(j - 1).unwrap();
                             let prev_lc = lc_terms.get(j - 1).unwrap();
-                            let prev_into_stopword = contexts.get(&prev_uq).unwrap().1.get(&uq);
-                            prob_prev = prev_into_stopword as f64 / features.get(&prev_lc).unwrap().tf;
+                            prob_prev =
+                                ctx.cases_term_is_followed(&prev_uq, &uq) as f64 / features.get(&prev_lc).unwrap().tf;
                         }
                         if j < uq_terms.len() {
                             // Not the last term
-                            // #next term occuring after this one / #next term
+                            // #next term occurring after this one / #next term
                             let next_uq = uq_terms.get(j + 1).unwrap();
                             let next_lc = lc_terms.get(j + 1).unwrap();
-                            let stopword_into_next = contexts.get(&uq).unwrap().1.get(&next_uq);
-                            prob_succ = stopword_into_next as f64 / features.get(&next_lc).unwrap().tf;
+                            prob_succ =
+                                ctx.cases_term_is_followed(&uq, &next_uq) as f64 / features.get(&next_lc).unwrap().tf;
+                            // fixme: Probability P(T[i+1] | T[i]) is weird.
+                            //        Why divide by Fr(T[i]) at first, but by Fr(T[i+1]) at second?
                         }
 
                         let prob = prob_prev * prob_succ;
@@ -524,48 +526,52 @@ impl Yake {
         }
     }
 
-    fn filter_candidates(
-        &self,
-        candidates: &mut Candidates,
-        minimum_length: usize,
-        only_alphanumeric_and_hyphen: bool, // could be a function
-    ) {
-        // fixme: filter right before inserting into the set to optimize
-        candidates.retain(|_k, v| !{
-            let lc_terms = v.lc_terms;
-            let lc_words: HashSet<&LTerm> = HashSet::from_iter(lc_terms);
-
-            let has_float = || lc_words.iter().any(|&w| self.is_d_tagged(w));
-            let has_stop_word = || self.is_stopword(&lc_terms[0]) || self.is_stopword(lc_terms.last().unwrap());
-            let has_unparsable = || lc_words.iter().any(|&w| self.is_u_tagged(w));
-            let not_enough_symbols = || lc_words.iter().map(|w| w.chars().count()).sum::<usize>() < minimum_length;
-            let has_non_alphanumeric =
-                || only_alphanumeric_and_hyphen && !lc_words.iter().all(word_is_alphanumeric_and_hyphen);
-
-            // remove candidate if
-            has_float() || has_stop_word() || has_unparsable() || not_enough_symbols() || has_non_alphanumeric()
-        });
+    fn is_candidate(&self, lc_terms: &[LTerm]) -> bool {
+        let lc_words: HashSet<&LTerm> = HashSet::from_iter(lc_terms);
+
+        let has_float = || lc_words.iter().any(|&w| self.is_d_tagged(w));
+        let has_stop_word = || self.is_stopword(&lc_terms[0]) || self.is_stopword(lc_terms.last().unwrap());
+        let has_unparsable = || lc_words.iter().any(|&w| self.is_u_tagged(w));
+        let not_enough_symbols =
+            || lc_terms.iter().map(|w| w.chars().count()).sum::<usize>() < self.config.minimum_chars;
+        let has_non_alphanumeric =
+            || self.config.only_alphanumeric_and_hyphen && !lc_words.iter().all(word_is_alphanumeric_and_hyphen);
+
+        !{ has_float() || has_stop_word() || has_unparsable() || not_enough_symbols() || has_non_alphanumeric() }
     }
 
     fn ngram_selection<'s>(&self, n: usize, sentences: &'s Sentences) -> Candidates<'s> {
         let mut candidates = Candidates::new();
+        let mut ignored = HashSet::new();
+
         for sentence in sentences.iter() {
             let length = sentence.words.len();
+
             for j in 0..length {
                 for k in (j + 1..length + 1).take(n) {
                     if (j..k).is_empty() {
                         continue;
                     }
 
-                    let lc_words = &sentence.lc_terms[j..k];
-                    let candidate = candidates.entry(lc_words).or_default();
+                    let lc_terms = &sentence.lc_terms[j..k];
+
+                    if ignored.contains(lc_terms) {
+                        continue;
+                    }
+                    // todo: optimize: if some checks have failed, we may skip the following ngrams (e.g. by j += k)
+                    if !self.is_candidate(lc_terms) {
+                        ignored.insert(lc_terms);
+                        continue;
+                    }
 
+                    let candidate = candidates.entry(lc_terms).or_default();
+                    candidate.lc_terms = lc_terms;
                     candidate.occurrences.push(&sentence.words[j..k]);
-                    candidate.lc_terms = lc_words;
                     candidate.uq_terms = &sentence.uq_terms[j..k];
                 }
             }
         }
+
         candidates
     }
 
@@ -583,784 +589,4 @@ fn word_is_alphanumeric_and_hyphen(word: impl AsRef<str>) -> bool {
 }
 
 #[cfg(test)]
-mod tests {
-    use pretty_assertions::assert_eq;
-
-    use super::*;
-
-    #[test]
-    fn short() {
-        let text = "this is a keyword";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(1));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("keyword", "keyword", 0.1583)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn order() {
-        // Verifies that order of keywords with the same score is preserved.
-        // If not, this test becomes unstable.
-        let text = "Machine learning";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(3));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("Machine", "machine", 0.1583), ("learning", "learning", 0.1583)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn laptop() {
-        let text = "Do you need an Apple laptop?";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(2));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("Apple", "apple", 0.1448), ("laptop", "laptop", 0.1583)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn headphones() {
-        let text = "Do you like headphones? \
-        Starting this Saturday, we will be kicking off a huge sale of headphones! \
-        If you need headphones, we've got you covered!";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(3));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected =
-            [("headphones", "headphones", 0.1141), ("Saturday", "saturday", 0.2111), ("Starting", "starting", 0.4096)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn multi_ngram() {
-        let text = "I will give you a great deal if you just read this!";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(1));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("great deal", "great deal", 0.0257)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn singular() {
-        let text = "One smartwatch. One phone. Many phone."; // Weird grammar; to compare with the "plural" test
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(2));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.2474)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn plural() {
-        let text = "One smartwatch. One phone. Many phones.";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(3));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.4949), ("phones", "phones", 0.4949)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn non_hyphenated() {
-        let text = "Truly high tech!"; // For comparison with the "hyphenated" test
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(1));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("high tech", "high tech", 0.0494)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn hyphenated() {
-        let text = "Truly high-tech!";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(1));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("high-tech", "high-tech", 0.1583)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn weekly_newsletter_short() {
-        let text = "This is your weekly newsletter!";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(3));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("weekly newsletter", "weekly newsletter", 0.0494),
-            ("newsletter", "newsletter", 0.1583),
-            ("weekly", "weekly", 0.2974),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn weekly_newsletter_long() {
-        let text = "This is your weekly newsletter! \
-            Hundreds of great deals - everything from men's fashion \
-            to high-tech drones!";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(5));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("weekly newsletter", "weekly newsletter", 0.0780),
-            ("newsletter", "newsletter", 0.2005),
-            ("weekly", "weekly", 0.3607),
-            ("great deals", "great deals", 0.4456),
-            ("high-tech drones", "high-tech drones", 0.4456),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn weekly_newsletter_long_with_paragraphs() {
-        let text = "This is your weekly newsletter!\n\n \
-            \tHundreds of great deals - everything from men's fashion \n\
-            to high-tech drones!";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(5));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("weekly newsletter", "weekly newsletter", 0.0780),
-            ("newsletter", "newsletter", 0.2005),
-            ("weekly", "weekly", 0.3607),
-            ("great deals", "great deals", 0.4456),
-            ("high-tech drones", "high-tech drones", 0.4456),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn composite_recurring_words_and_bigger_window() {
-        let text = "Machine learning is a growing field. Few research fields grow as much as machine learning grows.";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual =
-            Yake::new(stopwords, Config { ngrams: 2, window_size: 2, ..Default::default() }).get_n_best(text, Some(5));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Machine learning", "machine learning", 0.1346),
-            ("growing field", "growing field", 0.1672),
-            ("learning", "learning", 0.2265),
-            ("Machine", "machine", 0.2341),
-            ("growing", "growing", 0.2799),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn composite_recurring_words_near_numbers() {
-        let text = "I buy 100 yellow bananas every day. Every night I eat bananas - all but 5 bananas.";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(3));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected =
-            [("yellow bananas", "yellow bananas", 0.0682), ("buy", "buy", 0.1428), ("yellow", "yellow", 0.1428)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn composite_recurring_words_near_spelled_out_numbers() {
-        // For comparison with "composite_recurring_words_near_numbers" to see if numbers cause
-        let text = "I buy a hundred yellow bananas every day. Every night I eat bananas - all but five bananas.";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 2, ..Default::default() }).get_n_best(text, Some(3));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("hundred yellow", "hundred yellow", 0.0446),
-            ("yellow bananas", "yellow bananas", 0.1017),
-            ("day", "day", 0.1428),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn with_stopword_in_the_middle() {
-        let text = "Game of Thrones";
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual =
-            Yake::new(stopwords, Config { remove_duplicates: false, ..Config::default() }).get_n_best(text, Some(1));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [("Game of Thrones", "game of thrones", 0.01380)];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn google_sample_single_ngram() {
-        let text = include_str!("test_google.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config { ngrams: 1, ..Default::default() }).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Google", "google", 0.0251),
-            ("Kaggle", "kaggle", 0.0273),
-            ("data", "data", 0.08),
-            ("science", "science", 0.0983),
-            ("platform", "platform", 0.124),
-            ("service", "service", 0.1316),
-            ("acquiring", "acquiring", 0.1511),
-            ("learning", "learning", 0.1621),
-            ("Goldbloom", "goldbloom", 0.1625),
-            ("machine", "machine", 0.1672),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn google_sample_defaults() {
-        let text = include_str!("test_google.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Google", "google", 0.0251),
-            ("Kaggle", "kaggle", 0.0273),
-            ("CEO Anthony Goldbloom", "ceo anthony goldbloom", 0.0483),
-            ("data science", "data science", 0.055),
-            ("acquiring data science", "acquiring data science", 0.0603),
-            ("Google Cloud Platform", "google cloud platform", 0.0746),
-            ("data", "data", 0.08),
-            ("San Francisco", "san francisco", 0.0914),
-            ("Anthony Goldbloom declined", "anthony goldbloom declined", 0.0974),
-            ("science", "science", 0.0983),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn gitter_sample_defaults() {
-        let text = include_str!("test_gitter.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Gitter", "gitter", 0.0190),
-            ("GitLab", "gitlab", 0.0478),
-            ("acquires software chat", "acquires software chat", 0.0479),
-            ("chat startup Gitter", "chat startup gitter", 0.0512),
-            ("software chat startup", "software chat startup", 0.0612),
-            ("Gitter chat", "gitter chat", 0.0684),
-            ("GitLab acquires software", "gitlab acquires software", 0.0685),
-            ("startup", "startup", 0.0783),
-            ("software", "software", 0.0879),
-            ("code", "code", 0.0879),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn genius_sample_defaults() {
-        let text = include_str!("test_genius.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Genius", "genius", 0.0261),
-            ("company", "company", 0.0263),
-            ("Genius quietly laid", "genius quietly laid", 0.027),
-            ("company quietly laid", "company quietly laid", 0.0392),
-            ("media company", "media company", 0.0404),
-            ("Lehman", "lehman", 0.0412),
-            ("quietly laid", "quietly laid", 0.0583),
-            ("Tom Lehman told", "tom lehman told", 0.0603),
-            ("video", "video", 0.0650),
-            ("co-founder Tom Lehman", "co-founder tom lehman", 0.0669),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn german_sample_defaults() {
-        let text = include_str!("test_german.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("de").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Vereinigten Staaten", "vereinigten staaten", 0.0152), // LIAAD REFERENCE: 0.151
-            ("Präsidenten Donald Trump", "präsidenten donald trump", 0.0182),
-            ("Donald Trump", "donald trump", 0.0211), // LIAAD REFERENCE: 0.21
-            ("trifft Donald Trump", "trifft donald trump", 0.0231), // LIAAD REFERENCE: 0.23
-            ("Trump", "trump", 0.0240),
-            ("Trumps Finanzminister Steven", "trumps finanzminister steven", 0.0243),
-            ("Kanzlerin Angela Merkel", "kanzlerin angela merkel", 0.0275), // LIAAD REFERENCE: 0.273
-            ("deutsche Kanzlerin Angela", "deutsche kanzlerin angela", 0.0316), // LIAAD REFERENCE: 0.314
-            ("Merkel trifft Donald", "merkel trifft donald", 0.0353),       // LIAAD REFERENCE: 0.351
-            ("Exportnation Deutschland", "exportnation deutschland", 0.038), // LIAAD REFERENCE: 0.0379
-        ];
-
-        // REASONS FOR DISCREPANCY:
-        // - The text contains both "bereit" ("ready") and "bereits" ("already").
-        //   While "bereits" is a stopword, "bereit" is not.
-        //   LIAAD/yake keeps track of whether a term is a stopword or not
-        //   in a key-value mapping, where the key is the term, lowercase, plural-normalized.
-        //   (Note that the plural normalization techique used is rarely effective in German.)
-        //   Since "bereits" occurs before "bereit" in the text, LIAAD/yake sees it,
-        //   recognizes it is a stopword, and stores it under the key "bereit". Later,
-        //   when it encounters "bereit" (NOT a stopword), it already has that key in its
-        //   mapping so it looks it up and finds that it is a keyword (which it is not).
-        //   Meanwhile, yake-rust does not have such a key-value store, so it correctly
-        //   recognizes "bereits" as a stopword and "bereit" as a non-stopword. The extra
-        //   inclusion of "bereit" in the non-stopwords affects the TF statistics and thus
-        //   the frequency contribution to the weights, leading to slightly different scores.
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dutch_sample_defaults() {
-        let text = include_str!("test_nl.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("nl").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Vincent van Gogh", "vincent van gogh", 0.0111),
-            ("Gogh Museum", "gogh museum", 0.0125),
-            ("Gogh", "gogh", 0.0150),
-            ("Museum", "museum", 0.0438),
-            ("brieven", "brieven", 0.0635),
-            ("Vincent", "vincent", 0.0643),
-            ("Goghs schilderijen", "goghs schilderijen", 0.1009),
-            ("Gogh verging", "gogh verging", 0.1215),
-            ("Goghs", "goghs", 0.1651),
-            ("schrijven", "schrijven", 0.1704),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn finnish_sample_defaults() {
-        let text = include_str!("test_fi.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("fi").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Mobile Networks", "mobile networks", 0.0043),
-            ("Nokia tekee muutoksia", "nokia tekee muutoksia", 0.0061),
-            ("tekee muutoksia organisaatioonsa", "tekee muutoksia organisaatioonsa", 0.0065),
-            ("johtokuntaansa vauhdittaakseen yhtiön", "johtokuntaansa vauhdittaakseen yhtiön", 0.0088),
-            ("vauhdittaakseen yhtiön strategian", "vauhdittaakseen yhtiön strategian", 0.0088),
-            ("yhtiön strategian toteuttamista", "yhtiön strategian toteuttamista", 0.0092),
-            ("Networks", "networks", 0.0102),
-            ("Networks and Applications", "networks and applications", 0.0113),
-            ("strategian toteuttamista Nokia", "strategian toteuttamista nokia", 0.0127),
-            ("siirtyy Mobile Networks", "siirtyy mobile networks", 0.0130),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn italian_sample_defaults() {
-        let text = include_str!("test_it.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("it").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(5));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Champions League", "champions league", 0.0390),
-            ("Quarti", "quarti", 0.0520),
-            ("Atlético Madrid", "atlético madrid", 0.0592),
-            ("Ottavi di finale", "ottavi di finale", 0.0646),
-            ("Real Madrid", "real madrid", 0.0701),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn french_sample_defaults() {
-        let text = include_str!("test_fr.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("fr").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("dégrade en France", "dégrade en france", 0.0254),
-            ("jusque-là uniquement associée", "jusque-là uniquement associée", 0.0504),
-            ("sondage Ifop réalisé", "sondage ifop réalisé", 0.0554),
-            ("religion se dégrade", "religion se dégrade", 0.091),
-            ("France", "france", 0.0941),
-            ("l'extrême droite", "l'extrême droite", 0.0997),
-            ("sondage Ifop", "sondage ifop", 0.101),
-            ("Islam", "islam", 0.1021),
-            ("musulmane en France", "musulmane en france", 0.1078),
-            ("Allemagne", "allemagne", 0.1086),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    #[ignore = "Crashes due to failed unwrap"]
-    fn portuguese_sport_sample_defaults() {
-        let text = include_str!("test_pt_1.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("pt").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("seleção brasileira treinará", "seleção brasileira treinará", 0.0072),
-            ("seleção brasileira", "seleção brasileira", 0.0100),
-            ("Seleção Brasileira visando", "seleção brasileira visando", 0.0192),
-            ("Seleção Brasileira encara", "seleção brasileira encara", 0.0344),
-            ("brasileira treinará", "brasileira treinará", 0.0373),
-            ("Renato Augusto", "renato augusto", 0.0376),
-            ("Copa da Rússia", "copa da rússia", 0.0407),
-            ("seleção", "seleção", 0.0454),
-            ("brasileira", "brasileira", 0.0528),
-        ];
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn portuguese_tourism_sample_defaults() {
-        let text = include_str!("test_pt_2.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("pt").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Alvor", "alvor", 0.0165),
-            ("Rio Alvor", "rio alvor", 0.0336),
-            ("Ria de Alvor", "ria de alvor", 0.0488),
-            ("encantadora vila", "encantadora vila", 0.0575),
-            ("Algarve", "algarve", 0.0774),
-            ("impressionantes de Portugal", "impressionantes de portugal", 0.0844),
-            ("estuário do Rio", "estuário do rio", 0.0907),
-            ("vila", "vila", 0.1017),
-            ("Ria", "ria", 0.1053),
-            ("Oceano Atlântico", "oceano atlântico", 0.1357),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn spanish_sample_defaults() {
-        let text = include_str!("test_es.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("es").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Guerra Civil Española", "guerra civil española", 0.0032),
-            ("Guerra Civil", "guerra civil", 0.0130),
-            ("Civil Española", "civil española", 0.0153),
-            ("Partido Socialista Obrero", "partido socialista obrero", 0.0283),
-            ("empezó la Guerra", "empezó la guerra", 0.0333),
-            ("Socialista Obrero Español", "socialista obrero español", 0.0411),
-            ("José Castillo", "josé castillo", 0.0426),
-            ("Española", "española", 0.0566),
-            ("José Antonio Primo", "josé antonio primo", 0.0589),
-            ("José Calvo Sotelo", "josé calvo sotelo", 0.0596),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn polish_sample_defaults() {
-        let text = include_str!("test_pl.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("pl").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("franka", "franka", 0.0328),
-            ("Geerta Wildersa VVD", "geerta wildersa vvd", 0.0346),
-            ("Geerta Wildersa", "geerta wildersa", 0.0399),
-            ("kurs franka", "kurs franka", 0.0486),
-            ("partii Geerta Wildersa", "partii geerta wildersa", 0.0675),
-            ("proc", "proc", 0.0692),
-            ("mld", "mld", 0.0724),
-            ("Narodowego Banku Szwajcarii", "narodowego banku szwajcarii", 0.0728),
-            ("kurs franka poniżej", "kurs franka poniżej", 0.0758),
-            ("Wildersa", "wildersa", 0.0765),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn turkish_sample_defaults() {
-        let text = include_str!("test_tr.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("tr").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("OECD", "oecd", 0.0176),                               // LIAAD REFERENCE: 0.0178
-            ("Tek Bakışta Eğitim", "tek bakışta eğitim", 0.0232),   // LIAAD REFERENCE: 0.0236
-            ("eğitim", "eğitim", 0.0274),                           // LIAAD REFERENCE: 0.0278
-            ("OECD eğitim endeksi", "oecd eğitim endeksi", 0.0313), // LIAAD REFERENCE: 0.0323
-            ("OECD ortalamasının", "oecd ortalamasının", 0.0375),   // LIAAD REFERNENCE: 0.0383
-            ("Kalkınma Örgütü'nün", "kalkınma örgütü'nün", 0.0449), // LIAAD REFERENCE: 0.045
-            ("Tek Bakışta", "tek bakışta", 0.0449),                 // LIAAD REFERENCE: 0.045
-            ("İşbirliği ve Kalkınma", "i̇şbirliği ve kalkınma", 0.0468),
-            ("Türkiye'de", "türkiye'de", 0.0476), // LIAAD REFERENCE: 0.0480
-            ("yüksek", "yüksek", 0.0509),         // LIAAD REFERENCE: 0.0513
-        ];
-
-        // REASONS FOR DISCREPANCY:
-        // - Difference in tokenization.
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn arabic_sample_defaults() {
-        let text = include_str!("test_ar.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("ar").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("عبد السلام العجيلي", "عبد السلام العجيلي", 0.0105),
-            ("اللغة العربية الأربعاء", "اللغة العربية الأربعاء", 0.0139),
-            ("عبد النبي اصطيف", "عبد النبي اصطيف", 0.0142),
-            ("العجيلي في مرآة", "العجيلي في مرآة", 0.0177),
-            ("مرآة النقد المقارن", "مرآة النقد المقارن", 0.0183), // LIAAD REFERENCE: 0.018
-            ("السلام العجيلي", "السلام العجيلي", 0.0198),
-            ("اللغة العربية", "اللغة العربية", 0.0207),
-            ("مرآة النقد", "مرآة النقد", 0.0255), // LIAAD REFERENCE: 0.025
-            ("اللغة العربية بدمشق", "اللغة العربية بدمشق", 0.0261),
-            ("مجمع اللغة العربية", "مجمع اللغة العربية", 0.0281),
-        ];
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_1_defaults() {
-        let text = include_str!("test_data_1.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("pt").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Médio Oriente continua", "médio oriente continua", 0.0008),
-            ("Médio Oriente", "médio oriente", 0.0045),
-            ("Oriente continua", "oriente continua", 0.0117),
-            ("registar-se violentos confrontos", "registar-se violentos confrontos", 0.0178),
-            ("Faixa de Gaza", "faixa de gaza", 0.0268),
-            ("fogo hoje voltaram", "fogo hoje voltaram", 0.0311),
-            ("voltaram a registar-se", "voltaram a registar-se", 0.0311),
-            ("registar-se violentos", "registar-se violentos", 0.0311),
-            ("Exército israelita", "exército israelita", 0.0368),
-            ("Exército israelita voltou", "exército israelita voltou", 0.0639),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_2_defaults() {
-        let text = include_str!("test_data_2.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(5));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("highly radioactive water", "highly radioactive water", 0.0006),
-            ("crippled nuclear plant", "crippled nuclear plant", 0.0006),
-            ("ocean Japan official", "ocean japan official", 0.0031),
-            ("Japan official", "japan official", 0.0046),
-            ("official says highly", "official says highly", 0.0050),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_3_defaults() {
-        let text = include_str!("test_data_3.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(5));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Global Crossing", "global crossing", 0.0034),
-            ("Hutchison Telecommunications", "hutchison telecommunications", 0.0053),
-            ("Telecommunications and Singapore", "telecommunications and singapore", 0.0072),
-            ("Singapore Technologies", "singapore technologies", 0.0072),
-            ("Technologies take control", "technologies take control", 0.0157),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_4_defaults() {
-        let text = include_str!("test_data_4.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("annual revenues increasing", "annual revenues increasing", 0.0018),
-            ("retail inventory management", "retail inventory management", 0.0023),
-            ("Dollar General", "dollar general", 0.0034),
-            ("inventory management", "inventory management", 0.0112),
-            ("perpetual progress", "perpetual progress", 0.0133),
-            ("revenues increasing", "revenues increasing", 0.0133),
-            ("fast track", "fast track", 0.0133),
-            ("road to perpetual", "road to perpetual", 0.0159),
-            ("annual revenues", "annual revenues", 0.0168),
-            ("stores opened", "stores opened", 0.0168),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_5_defaults() {
-        let text = include_str!("test_data_5.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Handoff Trigger Table", "handoff trigger table", 0.0006), // LIAAD REFERENCE: 0.0007
-            ("Handoff", "handoff", 0.0010),
-            ("WLAN Networks ABSTRACT", "wlan networks abstract", 0.0019),
-            ("Vertical handoff", "vertical handoff", 0.0020),
-            ("Handoff Trigger", "handoff trigger", 0.0021),
-            ("proactive handoff scheme", "proactive handoff scheme", 0.0021),
-            ("HTT Method Figure", "htt method figure", 0.0022),
-            ("WLAN", "wlan", 0.0023),
-            ("ABSTRACT Vertical handoff", "abstract vertical handoff", 0.0030),
-            ("traditional handoff scheme", "traditional handoff scheme", 0.0033),
-        ];
-
-        // REASONS FOR DISCREPANCY:
-        // - Difference in sentence splitting.
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_6_defaults() {
-        let text = include_str!("test_data_6.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("MRSA", "mrsa", 0.0047),
-            ("TSN Database", "tsn database", 0.0107),
-            ("methicillin-resistant Staphylococcus aureus", "methicillin-resistant staphylococcus aureus", 0.0116),
-            ("rates of MRSA", "rates of mrsa", 0.0145),
-            ("Staphylococcus aureus", "staphylococcus aureus", 0.0167),
-            ("methicillin-resistant Staphylococcus", "methicillin-resistant staphylococcus", 0.0177),
-            ("prevalence of MRSA", "prevalence of mrsa", 0.0201),
-            ("MRSA infections", "mrsa infections", 0.0218),
-            ("MRSA infections detected", "mrsa infections detected", 0.0223),
-            ("TSN", "tsn", 0.0250),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-
-    #[test]
-    fn dataset_text_7_defaults() {
-        let text = include_str!("test_data_7.txt"); // LIAAD/yake sample text
-        let stopwords = StopWords::predefined("en").unwrap();
-        let mut actual = Yake::new(stopwords, Config::default()).get_n_best(text, Some(10));
-        // leave only 4 digits
-        actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
-        let expected = [
-            ("Environment Design Level", "environment design level", 0.0008),
-            ("Jerusalem Jerusalem", "jerusalem jerusalem", 0.0009),
-            ("Dynamics Based Control", "dynamics based control", 0.0011),
-            ("system dynamics", "system dynamics", 0.0017),
-            ("DBC", "dbc", 0.0019),
-            ("target system dynamics", "target system dynamics", 0.0019),
-            ("target dynamics", "target dynamics", 0.0023),
-            ("Science Bar Ilan", "science bar ilan", 0.0025),
-            ("EMT", "emt", 0.0026),
-            ("Dynamics", "dynamics", 0.0026),
-        ];
-        // Results agree with reference implementation LIAAD/yake
-
-        assert_eq!(actual, expected);
-    }
-}
+mod tests;
diff --git a/yake_rust/src/stopwords/mod.rs b/yake_rust/src/stopwords/mod.rs
index c7c0337..699c68d 100644
--- a/yake_rust/src/stopwords/mod.rs
+++ b/yake_rust/src/stopwords/mod.rs
@@ -8,6 +8,7 @@ use crate::LTerm;
 /// The list is used to mark potentially meaningless tokens and generally based on the language
 /// given as input. Tokens with fewer than three characters are also considered a stopword.
 #[derive(Debug, Default, Clone)]
+#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
 pub struct StopWords {
     set: HashSet<LTerm>,
 }
diff --git a/yake_rust/src/tests.rs b/yake_rust/src/tests.rs
new file mode 100644
index 0000000..c8f6863
--- /dev/null
+++ b/yake_rust/src/tests.rs
@@ -0,0 +1,763 @@
+use pretty_assertions::assert_eq;
+
+use super::*;
+
+fn test<const T: usize>(text: &str, lang: &str, cfg: Config, n_best: Option<usize>, expected: [(&str, &str, f64); T]) {
+    let stopwords = StopWords::predefined(lang).unwrap();
+    let mut actual = Yake::new(stopwords, cfg).get_n_best(text, n_best);
+    // round every score to 4 decimal places so it can be compared with the `expected` literals
+    actual.iter_mut().for_each(|r| r.score = (r.score * 10_000.).round() / 10_000.);
+    assert_eq!(actual, expected);
+}
+
+#[test]
+fn short() {
+    test("this is a keyword", "en", Config::default(), Some(1), [("keyword", "keyword", 0.1583)]);
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn keywords_order_is_preserved() {
+    // If not, this test becomes unstable.
+    test(
+        "Machine learning",
+        "en",
+        Config { ngrams: 1, ..Default::default() },
+        Some(3),
+        [("Machine", "machine", 0.1583), ("learning", "learning", 0.1583)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn laptop() {
+    test(
+        "Do you need an Apple laptop?",
+        "en",
+        Config { ngrams: 1, ..Default::default() },
+        Some(2),
+        [("Apple", "apple", 0.1448), ("laptop", "laptop", 0.1583)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn headphones() {
+    test(
+        "Do you like headphones? \
+            Starting this Saturday, we will be kicking off a huge sale of headphones! \
+            If you need headphones, we've got you covered!",
+        "en",
+        Config { ngrams: 1, ..Default::default() },
+        Some(3),
+        [("headphones", "headphones", 0.1141), ("Saturday", "saturday", 0.2111), ("Starting", "starting", 0.4096)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn multi_ngram() {
+    test(
+        "I will give you a great deal if you just read this!",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(1),
+        [("great deal", "great deal", 0.0257)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn singular() {
+    test(
+        // Weird grammar; to compare with the "plural" test
+        "One smartwatch. One phone. Many phone.",
+        "en",
+        Config { ngrams: 1, ..Default::default() },
+        Some(2),
+        [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.2474)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn plural() {
+    test(
+        "One smartwatch. One phone. Many phones.",
+        "en",
+        Config { ngrams: 1, ..Default::default() },
+        Some(3),
+        [("smartwatch", "smartwatch", 0.2025), ("phone", "phone", 0.4949), ("phones", "phones", 0.4949)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn non_hyphenated() {
+    // For comparison with the "hyphenated" test
+    test(
+        "Truly high tech!",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(1),
+        [("high tech", "high tech", 0.0494)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn hyphenated() {
+    test(
+        "Truly high-tech!",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(1),
+        [("high-tech", "high-tech", 0.1583)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn weekly_newsletter_short() {
+    test(
+        "This is your weekly newsletter!",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(3),
+        [
+            ("weekly newsletter", "weekly newsletter", 0.0494),
+            ("newsletter", "newsletter", 0.1583),
+            ("weekly", "weekly", 0.2974),
+        ],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn weekly_newsletter_long() {
+    test(
+        "This is your weekly newsletter! \
+        Hundreds of great deals - everything from men's fashion \
+        to high-tech drones!",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(5),
+        [
+            ("weekly newsletter", "weekly newsletter", 0.0780),
+            ("newsletter", "newsletter", 0.2005),
+            ("weekly", "weekly", 0.3607),
+            ("great deals", "great deals", 0.4456),
+            ("high-tech drones", "high-tech drones", 0.4456),
+        ],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn weekly_newsletter_long_with_paragraphs() {
+    test(
+        "This is your weekly newsletter!\n\n \
+        \tHundreds of great deals - everything from men's fashion \n\
+        to high-tech drones!",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(5),
+        [
+            ("weekly newsletter", "weekly newsletter", 0.0780),
+            ("newsletter", "newsletter", 0.2005),
+            ("weekly", "weekly", 0.3607),
+            ("great deals", "great deals", 0.4456),
+            ("high-tech drones", "high-tech drones", 0.4456),
+        ],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn composite_recurring_words_and_bigger_window() {
+    test(
+        "Machine learning is a growing field. Few research fields grow as much as machine learning grows.",
+        "en",
+        Config { ngrams: 2, window_size: 2, ..Default::default() },
+        Some(5),
+        [
+            ("Machine learning", "machine learning", 0.1346),
+            ("growing field", "growing field", 0.1672),
+            ("learning", "learning", 0.2265),
+            ("Machine", "machine", 0.2341),
+            ("growing", "growing", 0.2799),
+        ],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn composite_recurring_words_near_numbers() {
+    test(
+        "I buy 100 yellow bananas every day. Every night I eat bananas - all but 5 bananas.",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(3),
+        [("yellow bananas", "yellow bananas", 0.0682), ("buy", "buy", 0.1428), ("yellow", "yellow", 0.1428)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn composite_recurring_words_near_spelled_out_numbers() {
+    // For comparison with "composite_recurring_words_near_numbers": the same sentences with the numbers spelled out.
+    test(
+        "I buy a hundred yellow bananas every day. Every night I eat bananas - all but five bananas.",
+        "en",
+        Config { ngrams: 2, ..Default::default() },
+        Some(3),
+        [
+            ("hundred yellow", "hundred yellow", 0.0446),
+            ("yellow bananas", "yellow bananas", 0.1017),
+            ("day", "day", 0.1428),
+        ],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+#[test]
+fn with_stopword_in_the_middle() {
+    test(
+        "Game of Thrones",
+        "en",
+        Config { remove_duplicates: false, ..Config::default() },
+        Some(1),
+        [("Game of Thrones", "game of thrones", 0.01380)],
+    );
+    // Results agree with reference implementation LIAAD/yake
+}
+
+mod liaad_yake_samples {
+    use super::*;
+
+    #[test]
+    fn google_sample_single_ngram() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_google.txt"),
+            "en",
+            Config { ngrams: 1, ..Default::default() },
+            Some(10),
+            [
+                ("Google", "google", 0.0251),
+                ("Kaggle", "kaggle", 0.0273),
+                ("data", "data", 0.08),
+                ("science", "science", 0.0983),
+                ("platform", "platform", 0.124),
+                ("service", "service", 0.1316),
+                ("acquiring", "acquiring", 0.1511),
+                ("learning", "learning", 0.1621),
+                ("Goldbloom", "goldbloom", 0.1625),
+                ("machine", "machine", 0.1672),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn google_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_google.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("Google", "google", 0.0251),
+                ("Kaggle", "kaggle", 0.0273),
+                ("CEO Anthony Goldbloom", "ceo anthony goldbloom", 0.0483),
+                ("data science", "data science", 0.055),
+                ("acquiring data science", "acquiring data science", 0.0603),
+                ("Google Cloud Platform", "google cloud platform", 0.0746),
+                ("data", "data", 0.08),
+                ("San Francisco", "san francisco", 0.0914),
+                ("Anthony Goldbloom declined", "anthony goldbloom declined", 0.0974),
+                ("science", "science", 0.0983),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn gitter_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_gitter.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("Gitter", "gitter", 0.0190),
+                ("GitLab", "gitlab", 0.0478),
+                ("acquires software chat", "acquires software chat", 0.0479),
+                ("chat startup Gitter", "chat startup gitter", 0.0512),
+                ("software chat startup", "software chat startup", 0.0612),
+                ("Gitter chat", "gitter chat", 0.0684),
+                ("GitLab acquires software", "gitlab acquires software", 0.0685),
+                ("startup", "startup", 0.0783),
+                ("software", "software", 0.0879),
+                ("code", "code", 0.0879),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn genius_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_genius.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("Genius", "genius", 0.0261),
+                ("company", "company", 0.0263),
+                ("Genius quietly laid", "genius quietly laid", 0.027),
+                ("company quietly laid", "company quietly laid", 0.0392),
+                ("media company", "media company", 0.0404),
+                ("Lehman", "lehman", 0.0412),
+                ("quietly laid", "quietly laid", 0.0583),
+                ("Tom Lehman told", "tom lehman told", 0.0603),
+                ("video", "video", 0.0650),
+                ("co-founder Tom Lehman", "co-founder tom lehman", 0.0669),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn german_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_german.txt"),
+            "de",
+            Config::default(),
+            Some(10),
+            [
+                ("Vereinigten Staaten", "vereinigten staaten", 0.0152), // LIAAD REFERENCE: 0.0151
+                ("Präsidenten Donald Trump", "präsidenten donald trump", 0.0182),
+                ("Donald Trump", "donald trump", 0.0211), // LIAAD REFERENCE: 0.021
+                ("trifft Donald Trump", "trifft donald trump", 0.0231), // LIAAD REFERENCE: 0.023
+                ("Trump", "trump", 0.0240),
+                ("Trumps Finanzminister Steven", "trumps finanzminister steven", 0.0243),
+                ("Kanzlerin Angela Merkel", "kanzlerin angela merkel", 0.0275), // LIAAD REFERENCE: 0.0273
+                ("deutsche Kanzlerin Angela", "deutsche kanzlerin angela", 0.0316), // LIAAD REFERENCE: 0.0314
+                ("Merkel trifft Donald", "merkel trifft donald", 0.0353),       // LIAAD REFERENCE: 0.0351
+                ("Exportnation Deutschland", "exportnation deutschland", 0.038), // LIAAD REFERENCE: 0.0379
+            ],
+        );
+        // REASONS FOR DISCREPANCY:
+        // - The text contains both "bereit" ("ready") and "bereits" ("already").
+        //   While "bereits" is a stopword, "bereit" is not.
+        //   LIAAD/yake keeps track of whether a term is a stopword or not
+        //   in a key-value mapping, where the key is the term, lowercase, plural-normalized.
+        //   (Note that the plural normalization technique used is rarely effective in German.)
+        //   Since "bereits" occurs before "bereit" in the text, LIAAD/yake sees it,
+        //   recognizes it is a stopword, and stores it under the key "bereit". Later,
+        //   when it encounters "bereit" (NOT a stopword), it already has that key in its
+        //   mapping so it looks it up and finds that it is a stopword (which it is not).
+        //   Meanwhile, yake-rust does not have such a key-value store, so it correctly
+        //   recognizes "bereits" as a stopword and "bereit" as a non-stopword. The extra
+        //   inclusion of "bereit" in the non-stopwords affects the TF statistics and thus
+        //   the frequency contribution to the weights, leading to slightly different scores.
+    }
+
+    #[test]
+    fn dutch_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_nl.txt"),
+            "nl",
+            Config::default(),
+            Some(10),
+            [
+                ("Vincent van Gogh", "vincent van gogh", 0.0111),
+                ("Gogh Museum", "gogh museum", 0.0125),
+                ("Gogh", "gogh", 0.0150),
+                ("Museum", "museum", 0.0438),
+                ("brieven", "brieven", 0.0635),
+                ("Vincent", "vincent", 0.0643),
+                ("Goghs schilderijen", "goghs schilderijen", 0.1009),
+                ("Gogh verging", "gogh verging", 0.1215),
+                ("Goghs", "goghs", 0.1651),
+                ("schrijven", "schrijven", 0.1704),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn finnish_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_fi.txt"),
+            "fi",
+            Config::default(),
+            Some(10),
+            [
+                ("Mobile Networks", "mobile networks", 0.0043),
+                ("Nokia tekee muutoksia", "nokia tekee muutoksia", 0.0061),
+                ("tekee muutoksia organisaatioonsa", "tekee muutoksia organisaatioonsa", 0.0065),
+                ("johtokuntaansa vauhdittaakseen yhtiön", "johtokuntaansa vauhdittaakseen yhtiön", 0.0088),
+                ("vauhdittaakseen yhtiön strategian", "vauhdittaakseen yhtiön strategian", 0.0088),
+                ("yhtiön strategian toteuttamista", "yhtiön strategian toteuttamista", 0.0092),
+                ("Networks", "networks", 0.0102),
+                ("Networks and Applications", "networks and applications", 0.0113),
+                ("strategian toteuttamista Nokia", "strategian toteuttamista nokia", 0.0127),
+                ("siirtyy Mobile Networks", "siirtyy mobile networks", 0.0130),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn italian_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_it.txt"),
+            "it",
+            Config::default(),
+            Some(5),
+            [
+                ("Champions League", "champions league", 0.0390),
+                ("Quarti", "quarti", 0.0520),
+                ("Atlético Madrid", "atlético madrid", 0.0592),
+                ("Ottavi di finale", "ottavi di finale", 0.0646),
+                ("Real Madrid", "real madrid", 0.0701),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn french_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_fr.txt"),
+            "fr",
+            Config::default(),
+            Some(10),
+            [
+                ("dégrade en France", "dégrade en france", 0.0254),
+                ("jusque-là uniquement associée", "jusque-là uniquement associée", 0.0504),
+                ("sondage Ifop réalisé", "sondage ifop réalisé", 0.0554),
+                ("religion se dégrade", "religion se dégrade", 0.091),
+                ("France", "france", 0.0941),
+                ("l'extrême droite", "l'extrême droite", 0.0997),
+                ("sondage Ifop", "sondage ifop", 0.101),
+                ("Islam", "islam", 0.1021),
+                ("musulmane en France", "musulmane en france", 0.1078),
+                ("Allemagne", "allemagne", 0.1086),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    #[ignore = "Crashes due to failed unwrap"]
+    fn portuguese_sport_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_pt_1.txt"),
+            "pt",
+            Config::default(),
+            Some(10),
+            [
+                ("seleção brasileira treinará", "seleção brasileira treinará", 0.0072),
+                ("seleção brasileira", "seleção brasileira", 0.0100),
+                ("Seleção Brasileira visando", "seleção brasileira visando", 0.0192),
+                ("Seleção Brasileira encara", "seleção brasileira encara", 0.0344),
+                ("brasileira treinará", "brasileira treinará", 0.0373),
+                ("Renato Augusto", "renato augusto", 0.0376),
+                ("Copa da Rússia", "copa da rússia", 0.0407),
+                ("seleção", "seleção", 0.0454),
+                ("brasileira", "brasileira", 0.0528),
+            ],
+        );
+    }
+
+    #[test]
+    fn portuguese_tourism_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_pt_2.txt"),
+            "pt",
+            Config::default(),
+            Some(10),
+            [
+                ("Alvor", "alvor", 0.0165),
+                ("Rio Alvor", "rio alvor", 0.0336),
+                ("Ria de Alvor", "ria de alvor", 0.0488),
+                ("encantadora vila", "encantadora vila", 0.0575),
+                ("Algarve", "algarve", 0.0774),
+                ("impressionantes de Portugal", "impressionantes de portugal", 0.0844),
+                ("estuário do Rio", "estuário do rio", 0.0907),
+                ("vila", "vila", 0.1017),
+                ("Ria", "ria", 0.1053),
+                ("Oceano Atlântico", "oceano atlântico", 0.1357),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn spanish_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_es.txt"),
+            "es",
+            Config::default(),
+            Some(10),
+            [
+                ("Guerra Civil Española", "guerra civil española", 0.0032),
+                ("Guerra Civil", "guerra civil", 0.0130),
+                ("Civil Española", "civil española", 0.0153),
+                ("Partido Socialista Obrero", "partido socialista obrero", 0.0283),
+                ("empezó la Guerra", "empezó la guerra", 0.0333),
+                ("Socialista Obrero Español", "socialista obrero español", 0.0411),
+                ("José Castillo", "josé castillo", 0.0426),
+                ("Española", "española", 0.0566),
+                ("José Antonio Primo", "josé antonio primo", 0.0589),
+                ("José Calvo Sotelo", "josé calvo sotelo", 0.0596),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn polish_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_pl.txt"),
+            "pl",
+            Config::default(),
+            Some(10),
+            [
+                ("franka", "franka", 0.0328),
+                ("Geerta Wildersa VVD", "geerta wildersa vvd", 0.0346),
+                ("Geerta Wildersa", "geerta wildersa", 0.0399),
+                ("kurs franka", "kurs franka", 0.0486),
+                ("partii Geerta Wildersa", "partii geerta wildersa", 0.0675),
+                ("proc", "proc", 0.0692),
+                ("mld", "mld", 0.0724),
+                ("Narodowego Banku Szwajcarii", "narodowego banku szwajcarii", 0.0728),
+                ("kurs franka poniżej", "kurs franka poniżej", 0.0758),
+                ("Wildersa", "wildersa", 0.0765),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn turkish_sample_defaults() {
+        // LIAAD/yake sample text
+        test(
+            include_str!("test_tr.txt"),
+            "tr",
+            Config::default(),
+            Some(10),
+            [
+                ("OECD", "oecd", 0.0176),                               // LIAAD REFERENCE: 0.0178
+                ("Tek Bakışta Eğitim", "tek bakışta eğitim", 0.0232),   // LIAAD REFERENCE: 0.0236
+                ("eğitim", "eğitim", 0.0274),                           // LIAAD REFERENCE: 0.0278
+                ("OECD eğitim endeksi", "oecd eğitim endeksi", 0.0313), // LIAAD REFERENCE: 0.0323
+                ("OECD ortalamasının", "oecd ortalamasının", 0.0375),   // LIAAD REFERENCE: 0.0383
+                ("Kalkınma Örgütü'nün", "kalkınma örgütü'nün", 0.0449), // LIAAD REFERENCE: 0.045
+                ("Tek Bakışta", "tek bakışta", 0.0449),                 // LIAAD REFERENCE: 0.045
+                ("İşbirliği ve Kalkınma", "i̇şbirliği ve kalkınma", 0.0468),
+                ("Türkiye'de", "türkiye'de", 0.0476), // LIAAD REFERENCE: 0.0480
+                ("yüksek", "yüksek", 0.0509),         // LIAAD REFERENCE: 0.0513
+            ],
+        );
+        // REASONS FOR DISCREPANCY:
+        // - Difference in tokenization.
+    }
+
+    #[test]
+    fn arabic_sample_defaults() {
+        // LIAAD/yake sample text ("ar", default config); inline notes give the reference value where it differs.
+        test(
+            include_str!("test_ar.txt"),
+            "ar",
+            Config::default(),
+            Some(10),
+            [
+                ("عبد السلام العجيلي", "عبد السلام العجيلي", 0.0105),
+                ("اللغة العربية الأربعاء", "اللغة العربية الأربعاء", 0.0139),
+                ("عبد النبي اصطيف", "عبد النبي اصطيف", 0.0142),
+                ("العجيلي في مرآة", "العجيلي في مرآة", 0.0177),
+                ("مرآة النقد المقارن", "مرآة النقد المقارن", 0.0183), // LIAAD REFERENCE: 0.018
+                ("السلام العجيلي", "السلام العجيلي", 0.0198),
+                ("اللغة العربية", "اللغة العربية", 0.0207),
+                ("مرآة النقد", "مرآة النقد", 0.0255), // LIAAD REFERENCE: 0.025
+                ("اللغة العربية بدمشق", "اللغة العربية بدمشق", 0.0261),
+                ("مجمع اللغة العربية", "مجمع اللغة العربية", 0.0281),
+            ],
+        );
+    }
+
+    #[test]
+    fn dataset_text_1_defaults() {
+        // LIAAD/yake sample text ("pt", default config); Some(10) matches the 10 rows listed below.
+        test(
+            include_str!("test_data_1.txt"),
+            "pt",
+            Config::default(),
+            Some(10),
+            [
+                ("Médio Oriente continua", "médio oriente continua", 0.0008),
+                ("Médio Oriente", "médio oriente", 0.0045),
+                ("Oriente continua", "oriente continua", 0.0117),
+                ("registar-se violentos confrontos", "registar-se violentos confrontos", 0.0178),
+                ("Faixa de Gaza", "faixa de gaza", 0.0268),
+                ("fogo hoje voltaram", "fogo hoje voltaram", 0.0311),
+                ("voltaram a registar-se", "voltaram a registar-se", 0.0311),
+                ("registar-se violentos", "registar-se violentos", 0.0311),
+                ("Exército israelita", "exército israelita", 0.0368),
+                ("Exército israelita voltou", "exército israelita voltou", 0.0639),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn dataset_text_2_defaults() {
+        // LIAAD/yake sample text ("en", default config); Some(5) matches the 5 rows listed below.
+        test(
+            include_str!("test_data_2.txt"),
+            "en",
+            Config::default(),
+            Some(5),
+            [
+                ("highly radioactive water", "highly radioactive water", 0.0006),
+                ("crippled nuclear plant", "crippled nuclear plant", 0.0006),
+                ("ocean Japan official", "ocean japan official", 0.0031),
+                ("Japan official", "japan official", 0.0046),
+                ("official says highly", "official says highly", 0.0050),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn dataset_text_3_defaults() {
+        // LIAAD/yake sample text ("en", default config); Some(5) matches the 5 rows listed below.
+        test(
+            include_str!("test_data_3.txt"),
+            "en",
+            Config::default(),
+            Some(5),
+            [
+                ("Global Crossing", "global crossing", 0.0034),
+                ("Hutchison Telecommunications", "hutchison telecommunications", 0.0053),
+                ("Telecommunications and Singapore", "telecommunications and singapore", 0.0072),
+                ("Singapore Technologies", "singapore technologies", 0.0072),
+                ("Technologies take control", "technologies take control", 0.0157),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn dataset_text_4_defaults() {
+        // LIAAD/yake sample text ("en", default config); Some(10) matches the 10 rows listed below.
+        test(
+            include_str!("test_data_4.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("annual revenues increasing", "annual revenues increasing", 0.0018),
+                ("retail inventory management", "retail inventory management", 0.0023),
+                ("Dollar General", "dollar general", 0.0034),
+                ("inventory management", "inventory management", 0.0112),
+                ("perpetual progress", "perpetual progress", 0.0133),
+                ("revenues increasing", "revenues increasing", 0.0133),
+                ("fast track", "fast track", 0.0133),
+                ("road to perpetual", "road to perpetual", 0.0159),
+                ("annual revenues", "annual revenues", 0.0168),
+                ("stores opened", "stores opened", 0.0168),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn dataset_text_5_defaults() {
+        // LIAAD/yake sample text ("en", default config); Some(10) matches the 10 rows listed below.
+        test(
+            include_str!("test_data_5.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("Handoff Trigger Table", "handoff trigger table", 0.0006), // LIAAD REFERENCE: 0.0007
+                ("Handoff", "handoff", 0.0010),
+                ("WLAN Networks ABSTRACT", "wlan networks abstract", 0.0019),
+                ("Vertical handoff", "vertical handoff", 0.0020),
+                ("Handoff Trigger", "handoff trigger", 0.0021),
+                ("proactive handoff scheme", "proactive handoff scheme", 0.0021),
+                ("HTT Method Figure", "htt method figure", 0.0022),
+                ("WLAN", "wlan", 0.0023),
+                ("ABSTRACT Vertical handoff", "abstract vertical handoff", 0.0030),
+                ("traditional handoff scheme", "traditional handoff scheme", 0.0033),
+            ],
+        );
+        // REASONS FOR DISCREPANCY (row annotated with a LIAAD REFERENCE value above):
+        // - Difference in sentence splitting.
+    }
+
+    #[test]
+    fn dataset_text_6_defaults() {
+        // LIAAD/yake sample text ("en", default config); Some(10) matches the 10 rows listed below.
+        test(
+            include_str!("test_data_6.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("MRSA", "mrsa", 0.0047),
+                ("TSN Database", "tsn database", 0.0107),
+                ("methicillin-resistant Staphylococcus aureus", "methicillin-resistant staphylococcus aureus", 0.0116),
+                ("rates of MRSA", "rates of mrsa", 0.0145),
+                ("Staphylococcus aureus", "staphylococcus aureus", 0.0167),
+                ("methicillin-resistant Staphylococcus", "methicillin-resistant staphylococcus", 0.0177),
+                ("prevalence of MRSA", "prevalence of mrsa", 0.0201),
+                ("MRSA infections", "mrsa infections", 0.0218),
+                ("MRSA infections detected", "mrsa infections detected", 0.0223),
+                ("TSN", "tsn", 0.0250),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+
+    #[test]
+    fn dataset_text_7_defaults() {
+        // LIAAD/yake sample text ("en", default config); Some(10) matches the 10 rows listed below.
+        test(
+            include_str!("test_data_7.txt"),
+            "en",
+            Config::default(),
+            Some(10),
+            [
+                ("Environment Design Level", "environment design level", 0.0008),
+                ("Jerusalem Jerusalem", "jerusalem jerusalem", 0.0009),
+                ("Dynamics Based Control", "dynamics based control", 0.0011),
+                ("system dynamics", "system dynamics", 0.0017),
+                ("DBC", "dbc", 0.0019),
+                ("target system dynamics", "target system dynamics", 0.0019),
+                ("target dynamics", "target dynamics", 0.0023),
+                ("Science Bar Ilan", "science bar ilan", 0.0025),
+                ("EMT", "emt", 0.0026),
+                ("Dynamics", "dynamics", 0.0026),
+            ],
+        );
+        // Results agree with reference implementation LIAAD/yake
+    }
+}