diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..e27dc57 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,28 @@ +{ + "cSpell.words": [ + "ahash", + "Bioinformatics", + "bstr", + "byteset", + "bytesets", + "bytesum", + "corasick", + "Dataframe", + "gxhash", + "lexsort", + "Melem", + "memchr", + "memmem", + "MergeSort", + "QuickSort", + "Needleman", + "rapidfuzz", + "rfind", + "Skylake", + "stringwars", + "stringzilla", + "strstr", + "tfidf", + "Wunsch" + ] +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 5b5ae31..3d84cae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,16 +1,45 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 + +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.2.12", + "once_cell", + "version_check", + "zerocopy 0.7.35", +] [[package]] name = "aho-corasick" -version = "1.1.2" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anes" version = "0.1.6" @@ -23,18 +52,401 @@ version = "1.0.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" +[[package]] +name = "anyhow" +version = "1.0.97" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfed56ad506cb2c684a14971b8861fdc3baaaae314b9e5f9bb532cbe3ba7a4f" + +[[package]] +name = "approx" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab112f0a86d568ea0e627cc1d6be74a1e9cd55214684db5561995f6dad897c6" +dependencies = [ + "num-traits", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc208515aa0151028e464cc94a692156e945ce5126abd3537bb7fd6ba2143ed1" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e07e726e2b3f7816a85c6a45b6ec118eeeabf0b2a8c208122ad949437181f49a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num", +] + +[[package]] +name = "arrow-array" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2262eba4f16c78496adfd559a29fe4b24df6088efc9985a873d58e92be022d5" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + 
"half", + "hashbrown", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e899dade2c3b7f5642eb8366cfd898958bcca099cde6dfea543c7e8d3ad88d4" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4103d88c5b441525ed4ac23153be7458494c2b0c9a11115848fdb9b81f6f886a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "half", + "lexical-core", + "num", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d3cb0914486a3cae19a5cad2598e44e225d53157926d0ada03c20521191a65" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "regex", +] + +[[package]] +name = "arrow-data" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a329fb064477c9ec5f0870d2f5130966f91055c7c5bce2b3a084f116bc28c3b" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddecdeab02491b1ce88885986e25002a3da34dd349f682c7cfe67bab7cc17b86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d03b9340013413eb84868682ace00a1098c81a5ebc96d279f7ebf9a4cac3c0fd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap", + "lexical-core", + "num", + "serde", + "serde_json", +] + 
+[[package]] +name = "arrow-ord" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f841bfcc1997ef6ac48ee0305c4dfceb1f7c786fe31e67c1186edf775e1f1160" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1eeb55b0a0a83851aa01f2ca5ee5648f607e8506ba6802577afdda9d75cdedcd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" + +[[package]] +name = "arrow-select" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e2932aece2d0c869dd2125feb9bd1709ef5c445daa3838ac4112dcfa0fda52c" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "54.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "912e38bd6a7a7714c1d9b61df80315685553b7455e8a6045c27531d8ecd5b458" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num", + "regex", + "regex-syntax", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "autocfg" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "base64" +version = "0.22.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bio" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0875bce309de30d684a736aaf628bf9edbd3bdad896eb11b72a182155a5fb97f" +dependencies = [ + "anyhow", + "approx", + "bio-types", + "bit-set", + "bv", + "bytecount", + "csv", + "custom_derive", + "editdistancek", + "enum-map", + "fxhash", + "itertools", + "itertools-num", + "lazy_static", + "multimap", + "ndarray", + "newtype_derive", + "num-integer", + "num-traits", + "ordered-float", + "petgraph", + "rand 0.8.5", + "regex", + "serde", + "serde_derive", + "statrs", + "strum", + "strum_macros", + "thiserror 2.0.12", + "triple_accel", + "vec_map", +] + +[[package]] +name = "bio-types" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4dcf54f8b7f51450207d54780bab09c05f30b8b0caa991545082842e466ad7e" +dependencies = [ + "derive-new", + "lazy_static", + "regex", + "strum_macros", + "thiserror 1.0.69", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8214115b7bf84099f1309324e63141d4c5d7cc26862f97a0a857dbefe165bd" + +[[package]] +name = "blake3" +version = "1.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "675f87afced0413c9bb02843499dbbd3882a237645883f71a2b59644a6d2f753" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "bstr" +version = "1.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "531a9155a481e2ee699d4f98f43c0ca4ff8ee1bfd55c31e9e98fb29d2b176fe0" +dependencies = [ + "memchr", +] + [[package]] name = "bumpalo" version = "3.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +[[package]] +name = "bv" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8834bb1d8ee5dc048ee3124f2c7c1afcc6bc9aed03f11e9dfd8c69470a5db340" +dependencies = [ + "feature-probe", + "serde", +] + +[[package]] +name = "bytecount" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" + +[[package]] +name = "bytemuck" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6b1fc10dbac614ebc03540c9dbd60e83887fda27794998c6528f1782047d540" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + [[package]] name = "cast" version = "0.3.0" @@ -43,9 +455,14 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.85" +version = "1.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9b918671670962b48bc23753aef0c51d072dca6f52f01f800854ada6ddb7f7d3" +checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c" +dependencies = [ + "jobserver", + "libc", + "shlex", +] [[package]] name = "cfg-if" @@ -53,6 +470,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e36cc9d416881d2e24f9a963be5fb1cd90966419ac844274161d10488b3e825" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-targets 0.52.0", +] + [[package]] name = "ciborium" version = "0.2.2" @@ -68,213 +497,654 @@ dependencies = [ name = "ciborium-io" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clang" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84c044c781163c001b913cd018fc95a628c50d0d2dfea8bca77dad71edb16e37" +dependencies = [ + "clang-sys", + "libc", +] + +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", +] + +[[package]] +name = "clap" +version = "4.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c918d541ef2913577a0f9566e9ce27cb35b6df072075769e0b26cb5a554520da" 
+dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.12", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "csv" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" +dependencies = [ + "memchr", +] + +[[package]] +name = "custom_derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" + +[[package]] +name = "derive-new" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d150dea618e920167e5973d70ae6ece4385b7164e0d799fe7c122dd0a5d912ad" 
+dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "editdistancek" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e02df23d5b1c6f9e69fa603b890378123b93073df998a21e6e33b9db0a32613" + +[[package]] +name = "either" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "enum-map" +version = "2.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6866f3bfdf8207509a033af1a75a7b08abda06bbaaeae6669323fd5a097df2e9" +dependencies = [ + "enum-map-derive", +] + +[[package]] +name = "enum-map-derive" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f282cfdfe92516eb26c2af8589c274c7c17681f5ecc03c18255fe741c6aa64eb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "feature-probe" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "24.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" +dependencies = [ + "bitflags 
1.3.2", + "rustc_version 0.4.1", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.13.3+wasi-0.2.2", + "windows-targets 0.52.0", +] + +[[package]] +name = "glob" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" + +[[package]] +name = "gxhash" +version = "3.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a197c9b654827513cf53842c5c6d3da2b4b35a785f8e0eff78bdf8e445aba1bb" +dependencies = [ + "rustversion", +] + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" + +[[package]] +name = "iana-time-zone" +version = "0.1.61" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "235e081f3925a06703c2d0117ea8b91f042756fd6e7a6e5d901e8ca1a996b220" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core 0.52.0", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3954d50fe15b02142bf25d3b8bdadb634ec3948f103d04ffe3031bc8fe9d7058" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "is-terminal" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itertools-num" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a872a22f9e6f7521ca557660adb96dd830e54f0f490fa115bb55dd69d38b27e7" +dependencies = [ + "num-traits", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] -name = "ciborium-ll" -version = "0.2.2" +name = "jobserver" 
+version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ - "ciborium-io", - "half", + "libc", ] [[package]] -name = "clap" -version = "4.5.1" +name = "js-sys" +version = "0.3.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c918d541ef2913577a0f9566e9ce27cb35b6df072075769e0b26cb5a554520da" +checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" dependencies = [ - "clap_builder", + "wasm-bindgen", ] [[package]] -name = "clap_builder" -version = "4.5.1" +name = "lazy_static" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f3e7391dad68afb0c2ede1bf619f579a3dc9c2ec67f089baa397123a2f3d1eb" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "lexical-core" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ - "anstyle", - "clap_lex", + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", ] [[package]] -name = "clap_lex" -version = "0.7.0" +name = "lexical-parse-float" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] [[package]] -name = "criterion" -version = "0.5.1" +name = "lexical-parse-integer" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ - "anes", - "cast", - "ciborium", - "clap", - "criterion-plot", - "is-terminal", - "itertools", - "num-traits", - "once_cell", - "oorandom", - "plotters", - "rayon", - "regex", - "serde", - "serde_derive", - "serde_json", - "tinytemplate", - "walkdir", + "lexical-util", + "static_assertions", ] [[package]] -name = "criterion-plot" -version = "0.5.0" +name = "lexical-util" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +checksum = "5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ - "cast", - "itertools", + "static_assertions", ] [[package]] -name = "crossbeam-deque" -version = "0.8.5" +name = "lexical-write-float" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", + "lexical-util", + "lexical-write-integer", + "static_assertions", ] [[package]] -name = "crossbeam-epoch" -version = "0.9.18" +name = "lexical-write-integer" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ - "crossbeam-utils", + "lexical-util", + "static_assertions", ] [[package]] -name = "crossbeam-utils" -version = "0.8.19" +name = "libc" +version = "0.2.171" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" +checksum = 
"c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6" [[package]] -name = "crunchy" -version = "0.2.2" +name = "libm" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] -name = "either" -version = "1.10.0" +name = "log" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] -name = "getrandom" -version = "0.2.12" +name = "matrixmultiply" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "9380b911e3e96d10c1f415da0876389aaf1b56759054eeb0de7df940c456ba1a" dependencies = [ - "cfg-if", - "libc", - "wasi", + "autocfg", + "rawpointer", ] [[package]] -name = "half" -version = "2.3.1" +name = "memchr" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "multimap" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "defc4c55412d89136f966bbb339008b474350e5e6e78d2714439c386b3137a03" dependencies = [ - "cfg-if", - "crunchy", + "serde", ] [[package]] -name = "hermit-abi" -version = "0.3.6" +name = "nalgebra" +version = "0.33.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" +checksum = "26aecdf64b707efd1310e3544d709c5c0ac61c13756046aaaba41be5c4f66a3b" +dependencies = [ + "approx", + 
"matrixmultiply", + "num-complex", + "num-rational", + "num-traits", + "rand 0.8.5", + "rand_distr", + "simba", + "typenum", +] [[package]] -name = "is-terminal" -version = "0.4.12" +name = "ndarray" +version = "0.16.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" dependencies = [ - "hermit-abi", - "libc", - "windows-sys", + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", ] [[package]] -name = "itertools" -version = "0.10.5" +name = "newtype_derive" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +checksum = "ac8cd24d9f185bb7223958d8c1ff7a961b74b1953fd05dba7cc568a63b3861ec" dependencies = [ - "either", + "rustc_version 0.1.7", ] [[package]] -name = "itoa" -version = "1.0.10" +name = "num" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +checksum = "35bd024e8b2ff75562e5f34e7f4905839deb4b22955ef5e73d2fea1b9813cb23" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] [[package]] -name = "js-sys" -version = "0.3.68" +name = "num-bigint" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "406cda4b368d531c842222cf9d2600a9a4acce8d29423695379c6868a143a9ee" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" dependencies = [ - "wasm-bindgen", + "num-integer", + "num-traits", ] [[package]] -name = "libc" -version = "0.2.153" +name = "num-complex" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] [[package]] -name = "log" -version = "0.4.20" +name = "num-integer" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] [[package]] -name = "memchr" -version = "2.7.1" +name = "num-iter" +version = "0.1.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] [[package]] -name = "memchr_vs_stringzilla" -version = "0.1.0" +name = "num-rational" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f83d14da390562dca69fc84082e73e548e1ad308d24accdedd2720017cb37824" dependencies = [ - "criterion", - "memchr", - "rand", - "stringzilla", + "num-bigint", + "num-integer", + "num-traits", ] [[package]] name = "num-traits" -version = "0.2.18" +version = "0.2.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -289,6 +1159,78 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "opencv" +version = "0.94.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f80fd7d018d20b1e49bdd65e72350f1f63cad6bc9c15f850c47c31a6ad8d0d20" +dependencies = [ + "cc", + "dunce", + "jobserver", + "libc", + "num-traits", + "once_cell", + "opencv-binding-generator", + "pkg-config", + "semver 1.0.26", + "shlex", + "vcpkg", + "windows", +] + +[[package]] +name = "opencv-binding-generator" +version = "0.95.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7283829fe440be381fea73521f850b287fd44f994acd6453e1e19b3d479ef7fc" +dependencies = [ + "clang", + "clang-sys", + "dunce", + "once_cell", + "percent-encoding", + "regex", + "shlex", +] + +[[package]] +name = "ordered-float" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2c1f9f56e534ac6a9b8a4600bdf0f530fb393b5f393e7b4d03489c3cf0c3f01" +dependencies = [ + "num-traits", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap", +] + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "plotters" version = "0.3.5" @@ -317,6 +1259,21 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -325,9 +1282,9 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" dependencies = [ "unicode-ident", ] @@ -348,8 +1305,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3779b94aeb87e8bd4e834cee3650289ee9e0d5677f976ecdb6d219e5f4f6cd94" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.3", + "zerocopy 0.8.24", ] [[package]] @@ -359,7 +1327,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.3", ] [[package]] @@ -368,14 +1346,54 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom", + "getrandom 0.2.12", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom 0.3.1", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.3", ] +[[package]] +name = "rapidfuzz" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "270e04e5ea61d40841942bb15e451c29ee1618637bcf97fc7ede5dd4a9b1601b" + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "rayon" -version = "1.8.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7237101a77a10773db45d62004a272517633fbcc3df19d96455ede1122e051" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" dependencies = [ "either", "rayon-core", @@ -392,33 +1410,57 @@ dependencies = [ ] [[package]] -name = "regex" -version = "1.10.3" +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version 
= "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustc_version" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "c5f5376ea5e30ce23c03eb77cbe4962b988deead10910c372b226388b594c084" dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", + "semver 0.1.20", ] [[package]] -name = "regex-automata" -version = "0.4.5" +name = "rustc_version" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", + "semver 1.0.26", ] [[package]] -name = "regex-syntax" -version = "0.8.2" +name = "rustversion" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "f7c45b9784283f1b2e7fb61b42047c2fd678ef0960d4f6f1eba131594cc369d4" [[package]] name = "ryu" @@ -426,6 +1468,15 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" +[[package]] +name = "safe_arch" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96b02de82ddbe1b636e6170c21be622223aea188ef2e139be0a5b219ec215323" +dependencies = [ + "bytemuck", +] + [[package]] name 
= "same-file" version = "1.0.6" @@ -435,6 +1486,18 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "semver" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4f410fedcf71af0345d7607d246e7ad15faaadd49d240ee3b24e5dc21a820ac" + +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.197" @@ -467,25 +1530,155 @@ dependencies = [ ] [[package]] -name = "stringzilla" -version = "3.3.0" +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "simba" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3a386a501cd104797982c15ae17aafe8b9261315b5d07e3ec803f2ea26be0fa" +dependencies = [ + "approx", + "num-complex", + "num-traits", + "paste", + "wide", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "statrs" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a7521ac3427b9007b364cb7b412d5d2634c6a1108d95b73dd55d7341379df2" +checksum = "2a3fe7c28c6512e766b0874335db33c94ad7b8f9054228ae1c2abd47ce7d335e" +dependencies = [ + "approx", + "nalgebra", + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "stringwars" +version = "0.1.0" +dependencies = [ + "ahash", + "aho-corasick", + "arrow", + "bio", + "blake3", + "bstr", + "criterion", + "getrandom 0.3.1", + "gxhash", + "memchr", + "opencv", + "rand 0.9.0", + "rand_chacha 0.9.0", + "rand_xoshiro", + "rapidfuzz", + "rayon", + "regex", + "stringzilla", + "twox-hash", + 
"xxhash-rust", + "zeroize", +] + +[[package]] +name = "stringzilla" +version = "3.11.3" dependencies = [ "cc", ] +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "syn" -version = "2.0.50" +version = "2.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f1bdc9872430ce9b75da68329d1c1746faf50ffac5f19e02b71e37ff881ffb" +checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl 2.0.12", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = 
"tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -496,12 +1689,54 @@ dependencies = [ "serde_json", ] +[[package]] +name = "triple_accel" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22048bc95dfb2ffd05b1ff9a756290a009224b60b2f0e7525faeee7603851e63" + +[[package]] +name = "twox-hash" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908" +dependencies = [ + "rand 0.8.5", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" +dependencies = [ + "serde", +] + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.4.0" @@ -518,6 +1753,15 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +[[package]] +name = "wasi" +version = "0.13.3+wasi-0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" +dependencies = [ + "wit-bindgen-rt", +] + [[package]] name = "wasm-bindgen" version = "0.2.91" @@ -582,6 +1826,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "wide" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41b5576b9a81633f3e8df296ce0063042a73507636cbe956c61133dd7034ab22" +dependencies = [ + "bytemuck", + "safe_arch", +] + [[package]] name = "winapi" version = "0.3.9" @@ -613,13 +1867,91 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f919aee0a93304be7f62e8e5027811bbba96bcb1de84d6618be56e43f8a32a1" +dependencies = [ + "windows-core 0.59.0", + "windows-targets 0.53.0", +] + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-core" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "810ce18ed2112484b0d4e15d022e5f598113e220c53e373fb31e67e21670c1ce" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-result", + "windows-strings", + "windows-targets 0.53.0", +] + +[[package]] +name = "windows-implement" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83577b051e2f49a058c308f17f273b570a6a758386fc291b5f6a934dd84e48c1" +dependencies = [ + 
"proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb26fd936d991781ea39e87c3a27285081e3c0da5ca0fcbc02d368cc6f52ff01" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dccfd733ce2b1753b03b6d3c65edf020262ea35e20ccdf3e288043e6dd620e3" + +[[package]] +name = "windows-result" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06374efe858fab7e4f881500e6e86ec8bc28f9462c47e5a9941a0142ad86b189" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87fa48cc5d406560701792be122a10132491cff9d0aeb23583cc2dcafc847319" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.0", ] [[package]] @@ -628,13 +1960,29 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.53.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -643,38 +1991,147 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "wit-bindgen-rt" +version = "0.33.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" +dependencies = [ + "bitflags 2.9.0", +] + +[[package]] +name = "xxhash-rust" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" + +[[package]] +name = "zerocopy" 
+version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "zerocopy-derive 0.7.35", +] + +[[package]] +name = "zerocopy" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879" +dependencies = [ + "zerocopy-derive 0.8.24", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" diff --git a/Cargo.toml b/Cargo.toml index e37ce43..6f96f6e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,15 +1,156 @@ [package] -name = "memchr_vs_stringzilla" +name = "stringwars" version = "0.1.0" edition = "2018" [dependencies] -rand = "0.8.5" criterion = "0.5.1" -memchr = { version = "2.7.1", default-features = false } -stringzilla = { version = "3.3.0" } +# stringzilla = { version = "3.3.0" } +stringzilla = { path = "../StringZilla/" } + +# Feature-based dependencies for benchmarks +[features] +bench_find = [ + "memchr", # Substring Search + "bstr", # Byteset Search + "aho-corasick", # Byteset Search + "regex", # Byteset Search +] +bench_hash = [ + "ahash", # Hashing + "xxhash-rust", # One-Shot Hashing + "twox-hash", # One-Shot Hashing (same algo as xxHash) + "gxhash", # One-Shot Hashing + 
"blake3", # Cryptographic Hashing Baseline +] +bench_sequence = [ + "arrow", # Sorting + "rayon", # Parallel Sorting +] +bench_similarity = [ + "rapidfuzz", # Levenshtein Distance (also `strsim`) + "bio", # Needleman-Wunsch Score +] +bench_memory = [ + # "opencv", # Lookup Transform + "rand", # Randomize Buffer + "zeroize", # Obfuscate Buffer + "getrandom", # Randomize Buffer via OS + "rand_chacha", # Randomize Buffer + "rand_xoshiro", # Randomize Buffer +] + +# TODO: String properties: +# - `unicode-width` +# - `textwrap` +# - `unicode-segmentation` + +[dependencies.memchr] +version = "2.7.1" +default-features = false +optional = true + +[dependencies.bio] +version = "2.2.0" +default-features = false +optional = true + +[dependencies.bstr] +version = "1.11.3" +default-features = false +optional = true + +[dependencies.aho-corasick] +version = "1.1.3" +optional = true + +[dependencies.regex] +version = "1.11.1" +optional = true + +[dependencies.rapidfuzz] +version = "0.5.0" +optional = true + +[dependencies.blake3] +version = "1.6.1" +optional = true + +[dependencies.gxhash] +version = "3.4.1" +optional = true + +[dependencies.ahash] +version = "0.8" +optional = true + +[dependencies.xxhash-rust] +version = "0.8" +optional = true +features = ["xxh3", "const_xxh3"] + +[dependencies.twox-hash] +version = "2.1.0" +optional = true + +[dependencies.arrow] +version = "54.2.1" +optional = true + +[dependencies.rayon] +version = "1.10.0" +optional = true + +[dependencies.opencv] +version = "0.94.2" +optional = true + +[dependencies.zeroize] +version = "1.8.1" +optional = true + +[dependencies.getrandom] +version = "0.3.1" +optional = true + +[dependencies.rand] +version = "0.9.0" +optional = true + +[dependencies.rand_chacha] +version = "0.9.0" +optional = true + +[dependencies.rand_xoshiro] +version = "0.7.0" +optional = true + +[[bench]] +name = "bench_find" +path = "bench_find.rs" +harness = false +required-features = ["bench_find"] + +[[bench]] +name = 
"bench_similarity" +path = "bench_similarity.rs" +harness = false +required-features = ["bench_similarity"] + +[[bench]] +name = "bench_hash" +path = "bench_hash.rs" +harness = false +required-features = ["bench_hash"] + +[[bench]] +name = "bench_sequence" +path = "bench_sequence.rs" +harness = false +required-features = ["bench_sequence"] [[bench]] -name = "bench" +name = "bench_memory" +path = "bench_memory.rs" harness = false -path = "bench.rs" +required-features = ["bench_memory"] diff --git a/README.md b/README.md index acff49e..9600fb5 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,126 @@ -# [`memchr`](https://github.com/BurntSushi/memchr) vs [`stringzilla`](https://github.com/ashvardanian/StringZilla) +# StringWa.rs -## Rust Substring Search Benchmarks +![StringWa.rs Thumbnail](https://github.com/ashvardanian/ashvardanian/blob/master/repositories/StringWa.rs.jpg?raw=true) + +_Not to pick a fight, but let there be String Wars!_ 😅 +Jokes aside, many __great__ libraries for string processing exist. +_Mostly, of course, written in Assembly, C, and C++, but some in Rust as well._ 😅 + +Where Rust decimates C and C++, however, is the __simplicity__ of dependency management, making it great for benchmarking "Systems Software"! +So, to accelerate the development of the [`StringZilla`](https://github.com/ashvardanian/StringZilla) C library, I've created this repository to compare it against some of my & communities most beloved Rust projects, like: + +- [`memchr`](https://github.com/BurntSushi/memchr) for substring search. +- [`rapidfuzz`](https://github.com/rapidfuzz/rapidfuzz-rs) for edit distances. +- [`aHash`](https://github.com/tkaitchuck/aHash) for hashing. +- [`aho_corasick`](https://github.com/BurntSushi/aho-corasick) for multi-pattern search. +- [`tantivy`](https://github.com/quickwit-oss/tantivy) for document retrieval. + +Of course, the functionality of the projects is different, as are the APIs and the usage patterns. 
+So, I focus on the workloads for which StringZilla was designed and compare the throughput of the core operations. +Notably, I also favor modern hardware with support for a wider range of SIMD instructions, like mask-equipped AVX-512 on x86 starting from the 2015 Intel Skylake-X CPUs or more recent predicated variable-length SVE and SVE2 on Arm, that aren't supported by most of the existing libraries and Rust tooling. + +## String Hashing Benchmarks + +Many great hashing libraries exist in Rust, C, and C++. +Typical top choices are `aHash`, `xxHash`, `blake3`, `gxhash`, `CityHash`, `MurmurHash`, or the native `std::hash`. +Many of them have similar pitfalls: + +- They are not always documented to have a certain reproducible output and are recommended for use only for local in-memory construction of hash tables, not for serialization or network communication. +- They don't always support streaming and require the whole input to be available in memory at once. +- They don't always pass the SMHasher test suite, especially with `--extra` checks enabled. +- They generally don't have a dynamic dispatch mechanism to simplify shipping of precompiled software to a wide range of users. + +StringZilla addresses those issues and seems to provide competitive performance.
+On an Intel Sapphire Rapids CPU, on the `xlsum.csv` dataset, the following numbers can be expected for hashing individual whitespace-delimited words and newline-delimited lines: + +| Library | Shorter Words | Longer Lines | +| ---------------------- | -------------: | --------------: | +| `std::hash` | 0.43 GiB/s | 3.74 GiB/s | +| `xxh3::xxh3_64` | 1.08 GiB/s | 9.48 GiB/s | +| `aHash::hash_one` | 1.23 GiB/s | 8.61 GiB/s | +| `gxhash::gxhash64` | __2.68 GiB/s__ | 10.81 GiB/s | +| `stringzilla::hash` | 1.84 GiB/s | __11.23 GiB/s__ | +| | | | +| `blake3::hash` | 0.10 GiB/s | 1.97 GiB/s | +| `stringzilla::bytesum` | 2.16 GiB/s | 11.65 GiB/s | + +> Blake3 and byte-level summation are provided as a reference for expected lower and upper bounds. +> Blake3 is a cryptographic hash function and is obliged to provide a certain level of security, which comes at a cost. +> Byte-level summation is a simple operation that is still sometimes used in practice, and is expected to be the fastest. + +In larger systems, however, we often need the ability to incrementally hash the data. +This is especially important in distributed systems, where the data is too large to fit into memory at once. + +| Library | Shorter Words | Longer Lines | +| -------------------------- | -------------: | -------------: | +| `std::hash::DefaultHasher` | 0.51 GiB/s | 3.92 GiB/s | +| `aHash::AHasher` | __1.30 GiB/s__ | __8.56 GiB/s__ | +| `stringzilla::HashState` | 0.89 GiB/s | 6.39 GiB/s | + +## Substring & Character-Set Search Benchmarks Substring search is one of the most common operations in text processing, and one of the slowest.
-This repository provides basic benchmarking scripts for comparing the throughput of [`stringzilla`](https://github.com/ashvardanian/StringZilla) and [`memchr`](https://github.com/BurntSushi/memchr). -For normal order and reverse order search, over ASCII and UTF8 input data, the following numbers can be expected. - -| | ASCII ⏩ | ASCII ⏪ | UTF8 ⏩ | UTF8 ⏪ | -| ------------- | --------------: | --------------: | -------------: | --------------: | -| Intel: | | | | | -| `memchr` | 5.89 GB/s | 1.08 GB/s | 8.73 GB/s | 3.35 GB/s | -| `stringzilla` | __8.37__ GB/s | __8.21__ GB/s | __11.21__ GB/s | __11.20__ GB/s | -| Arm: | | | | | -| `memchr` | 6.38 GB/s | 1.12 GB/s | __13.20__ GB/s | 3.56 GB/s | -| `stringzilla` | __6.56__ GB/s | __5.56__ GB/s | 9.41 GB/s | __8.17__ GB/s | -| | | | | | -| Average | __1.2x__ faster | __6.2x__ faster | - | __2.8x__ faster | - - -> For Intel the benchmark was run on AWS `r7iz` instances with Sapphire Rapids cores. -> For Arm the benchmark was run on AWS `r7g` instances with Graviton 3 cores. -> The ⏩ signifies forward search, and ⏪ signifies reverse order search. -> At the time of writing, the latest versions of `memchr` and `stringzilla` were used - 2.7.1 and 3.3.0, respectively. +Most of the time, programmers don't think about replacing the `str::find` method, as it's already expected to be optimized. +In many languages it's offloaded to the C standard library [`memmem`](https://man7.org/linux/man-pages/man3/memmem.3.html) or [`strstr`](https://en.cppreference.com/w/c/string/byte/strstr) for NULL-terminated strings. +The C standard library is, however, also implemented by humans, and a better solution can be created. 
+ +| Library | Shorter Words | Longer Lines | +| -------------------- | --------------: | --------------: | +| `std::str::find` | 9.48 GiB/s | 10.88 GiB/s | +| `memmem::find` | 9.51 GiB/s | 10.83 GiB/s | +| `stringzilla::find` | __10.45 GiB/s__ | __10.89 GiB/s__ | +| | | | +| `std::str::rfind` | 2.96 GiB/s | 3.65 GiB/s | +| `memmem::rfind` | 2.95 GiB/s | 3.71 GiB/s | +| `stringzilla::rfind` | __9.78 GiB/s__ | __10.43 GiB/s__ | + +> Higher-throughput evaluation with `memmem` is possible, if the "matcher" object is reused to iterate through the string instead of constructing a new one for each search. + +Similarly, one can search a string for a set of characters. +StringWa.rs takes a few representative examples of various character sets that appear in real parsing or string validation tasks: + +- tabulation characters, like `\n\r\v\f`; +- HTML and XML markup characters, like `&'\"=[]`; +- numeric characters, like `0123456789`. + +It's common in such cases to pre-construct some library-specific filter-object or Finite State Machine (FSM) to search for a set of characters. +Once that object is constructed, all of its inclusions in each token (word or line) are counted. +Current numbers should look like this: + +| Library | Shorter Words | Longer Lines | +| --------------------------- | -------------: | -------------: | +| `bstr::iter` | 0.26 GiB/s | 0.25 GiB/s | +| `regex::find_iter` | 0.23 GiB/s | 5.22 GiB/s | +| `aho_corasick::find_iter` | 0.41 GiB/s | 0.50 GiB/s | +| `stringzilla::find_byteset` | __1.61 GiB/s__ | __8.17 GiB/s__ | + +## Strings Sorting & Intersections Benchmarks + +Rust has several Dataframe libraries, DBMS and Search engines that heavily rely on string sorting and intersections. +Those operations are mostly implemented using conventional algorithms: + +- Comparison-based Quicksort or Mergesort for sorting. +- Hash-based or Tree-based algorithms for intersections. 
+ +Assuming that both the comparisons and the hash functions can be accelerated with SIMD, StringZilla could already provide a performance boost in such applications, but starting with v4 it also provides specialized algorithms for sorting and intersections. +Those are directly compatible with arbitrary string-comparable collection types that support indexed access to their elements. + +| Library | Shorter Words | Longer Lines | +| ------------------------------------------- | -----------------: | ----------------: | +| `std::sort_unstable_by_key` | 54.35 Melem/s | 57.70 Melem/s | +| `rayon::par_sort_unstable_by_key` on 1 vCPU | 47.08 Melem/s | 50.35 Melem/s | +| `arrow::lexsort_to_indices` | 122.20 Melem/s | __84.73 Melem/s__ | +| `stringzilla::argsort_permutation` | __182.88 Melem/s__ | 74.64 Melem/s | + +## Random Generation & Lookup Tables + +Some of the most common operations in data processing are random generation and lookup tables. +That's true not only for strings but for any data type, and StringZilla has been extensively used in Image Processing and Bioinformatics for those purposes. + +## String Edit Distance Benchmarks + +Edit Distance calculation is a common component of Search Engines, Data Cleaning, and Natural Language Processing, as well as in Bioinformatics. +It's a computationally expensive operation, generally implemented using dynamic programming, with a quadratic time complexity upper bound. ## Replicating the Results @@ -31,21 +128,40 @@ Before running benchmarks, you can test your Rust environment running: ```bash cargo install cargo-criterion --locked -HAYSTACK_PATH=README.md cargo criterion --jobs 8 +``` + +Wars always take long, and so do these benchmarks. +Every one of them includes a few seconds of a warm-up phase to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. 
+Each of them accepts a few environment variables to control the dataset, the tokenization, and the error bounds. +You can log those by printing file-level documentation using `awk` on Linux: + +```bash +awk '/^\/\/!/ { print } !/^\/\/!/ { exit }' bench_find.rs +``` + +Commonly used environment variables are: + +- `STRINGWARS_DATASET` - the path to the textual dataset file. +- `STRINGWARS_TOKENS` - the tokenization mode: `file`, `lines`, or `words`. +- `STRINGWARS_ERROR_BOUND` - the maximum allowed error in the Levenshtein distance. + +Here is an example of a common benchmark run on a Unix-like system: + +```bash +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_hash bench_hash --jobs 8 ``` On Windows using PowerShell you'd need to set the environment variable differently: ```powershell -$env:HAYSTACK_PATH="README.md" +$env:STRINGWARS_DATASET="README.md" cargo criterion --jobs 8 ``` -As part of the benchmark, the input "haystack" file is whitespace-tokenized into an array of strings. -In every benchmark iteration, a new "needle" is taken from that array of tokens. -All inclusions of that token in the haystack are counted, and the throughput is calculated. -This generally results in very stable and predictable results. -The benchmark also includes a warm-up, to ensure that the CPU caches are filled and the results are not affected by cold start or SIMD-related frequency scaling. 
+## Datasets ### ASCII Corpus @@ -54,7 +170,7 @@ It's 124 MB in size, 1'000'000 lines long, and contains 8'388'608 tokens of mean ```bash wget --no-clobber -O leipzig1M.txt https://introcs.cs.princeton.edu/python/42sort/leipzig1m.txt -HAYSTACK_PATH=leipzig1M.txt cargo criterion --jobs 8 +STRINGWARS_DATASET=leipzig1M.txt cargo criterion --jobs 8 ``` ### UTF8 Corpus @@ -66,5 +182,5 @@ To download, unpack, and run the benchmarks, execute the following bash script i ```bash wget --no-clobber -O xlsum.csv.gz https://github.com/ashvardanian/xl-sum/releases/download/v1.0.0/xlsum.csv.gz gzip -d xlsum.csv.gz -HAYSTACK_PATH=xlsum.csv cargo criterion --jobs 8 +STRINGWARS_DATASET=xlsum.csv cargo criterion --jobs 8 ``` diff --git a/bench.rs b/bench.rs deleted file mode 100644 index 53ec131..0000000 --- a/bench.rs +++ /dev/null @@ -1,124 +0,0 @@ -use criterion::{criterion_group, criterion_main, Criterion, Throughput}; -use std::env; -use std::fs; - -use memchr::memmem; -use stringzilla::StringZilla; - -fn configure_bench() -> Criterion { - Criterion::default() - .sample_size(1000) // Test this many needles. - .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle. - .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. -} - -fn benchmarks(c: &mut Criterion) { - // Get the haystack path from the environment variable. - let haystack_path = - env::var("HAYSTACK_PATH").expect("HAYSTACK_PATH environment variable not set"); - let haystack_content = fs::read_to_string(&haystack_path).expect("Could not read haystack"); - - // Tokenize the haystack content by white space. 
- let needles: Vec<&str> = haystack_content.split_whitespace().collect(); - if needles.is_empty() { - panic!("No tokens found in the haystack."); - } - - let haystack = haystack_content.as_bytes(); - let haystack_length = haystack.len(); - - // Benchmarks for forward search - let mut g = c.benchmark_group("search-forward"); - g.throughput(Throughput::Bytes(haystack_length as u64)); - perform_forward_benchmarks(&mut g, &needles, haystack); - g.finish(); - - // Benchmarks for reverse search - let mut g = c.benchmark_group("search-reverse"); - g.throughput(Throughput::Bytes(haystack_length as u64)); - perform_reverse_benchmarks(&mut g, &needles, haystack); - g.finish(); -} - -fn perform_forward_benchmarks( - g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - needles: &[&str], - haystack: &[u8], -) { - // Benchmark for StringZilla forward search - let mut token_index: usize = 0; - g.bench_function("stringzilla::find", |b| { - b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); - let mut pos: usize = 0; - while let Some(found) = (&haystack[pos..]).sz_find(token_bytes) { - pos += found + token_bytes.len(); - } - token_index = (token_index + 1) % needles.len(); - }) - }); - - // Benchmark for memchr (forward search) - let mut token_index: usize = 0; // Reset token index for the next benchmark - g.bench_function("memmem::find", |b| { - b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); - let mut pos: usize = 0; - while let Some(found) = memmem::find(&haystack[pos..], token_bytes) { - pos += found + token_bytes.len(); - } - token_index = (token_index + 1) % needles.len(); - }) - }); -} - -fn perform_reverse_benchmarks( - g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, - needles: &[&str], - haystack: &[u8], -) { - // Benchmark for StringZilla reverse search - let mut token_index: usize = 0; - g.bench_function("stringzilla::rfind", |b| { - b.iter(|| { - let 
token = needles[token_index]; - let token_bytes = token.as_bytes(); - let mut pos: Option = Some(haystack.len()); - while let Some(end) = pos { - if let Some(found) = (&haystack[..end]).sz_rfind(token_bytes) { - pos = Some(found); // Update position to the start of the found token for the next search. - } else { - break; // No more occurrences found. - } - } - token_index = (token_index + 1) % needles.len(); - }) - }); - - // Benchmark for memchr reverse search - let mut token_index: usize = 0; - g.bench_function("memmem::rfind", |b| { - b.iter(|| { - let token = needles[token_index]; - let token_bytes = token.as_bytes(); - let mut pos: Option = Some(haystack.len()); - while let Some(end) = pos { - if let Some(found) = memmem::rfind(&haystack[..end], token_bytes) { - pos = Some(found); // Update position to the start of the found token for the next search. - } else { - break; // No more occurrences found. - } - } - token_index = (token_index + 1) % needles.len(); - }) - }); -} - -criterion_group! { - name = sz_bench; - config = configure_bench(); - targets = benchmarks -} -criterion_main!(sz_bench); diff --git a/bench_feature.rs b/bench_feature.rs new file mode 100644 index 0000000..bef3368 --- /dev/null +++ b/bench_feature.rs @@ -0,0 +1,51 @@ +use std::env; +use std::fs; + +use criterion::{criterion_group, criterion_main, Criterion, Throughput}; + +use memchr::memmem; +use stringzilla::StringZilla; + +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(1000) // Test this many needles. + .warm_up_time(std::time::Duration::from_secs(10)) // Let the CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(120)) // Actual measurement time. +} + +fn bench_tfidf(c: &mut Criterion) { + // Get the haystack path from the environment variable. 
+ let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let haystack_content = fs::read_to_string(&dataset_path).expect("Could not read haystack"); + + // Tokenize the haystack content by white space. + let needles: Vec<&str> = haystack_content.split_whitespace().collect(); + if needles.is_empty() { + panic!("No tokens found in the haystack."); + } + + let haystack = haystack_content.as_bytes(); + let haystack_length = haystack.len(); + + // Benchmarks for forward search + let mut g = c.benchmark_group("search-forward"); + g.throughput(Throughput::Bytes(haystack_length as u64)); + perform_forward_benchmarks(&mut g, &needles, haystack); + g.finish(); + + // Benchmarks for reverse search + let mut g = c.benchmark_group("search-reverse"); + g.throughput(Throughput::Bytes(haystack_length as u64)); + perform_reverse_benchmarks(&mut g, &needles, haystack); + g.finish(); +} + +... + +criterion_group! { + name = bench_tfidf_group; + config = configure_bench(); + targets = bench_tfidf +} +criterion_main!(bench_tfidf_group); diff --git a/bench_find.rs b/bench_find.rs new file mode 100644 index 0000000..23f2ce0 --- /dev/null +++ b/bench_find.rs @@ -0,0 +1,349 @@ +#![doc = r#" +# StringWa.rs: Substring & Character-Set Search Benchmarks + +This file benchmarks the forward and backward exact substring search functionality provided by +the StringZilla library and the memchr crate. The input file is treated as a haystack and all +of its tokens as needles. The throughput numbers are reported in Gigabytes per Second and for +any sampled token - all of its inclusions in a string are located. +Be warned, for large files, it may take a while! + +The input file is treated as a haystack and all of its tokens as needles. For substring searches, +each occurrence is located. 
For byteset searches, three separate operations are performed per token, +looking for: + +- any of "\n\r\v\f" - the 4 tabulation characters +- any of "&'\"=[]" - the 6 HTML-related characters +- any of "0123456789" - the 10 numeric characters + +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. + +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_find bench_find --jobs 8 +``` +"#] +use std::env; +use std::error::Error; +use std::fs; +use std::time::Duration; + +use criterion::{black_box, Criterion, Throughput}; + +use aho_corasick::AhoCorasick; +use bstr::ByteSlice; +use memchr::memmem; +use regex::bytes::Regex; +use stringzilla::sz; + +/// Loads the dataset from the file specified by the `STRINGWARS_DATASET` environment variable. +pub fn load_dataset() -> Result, Box> { + let dataset_path = env::var("STRINGWARS_DATASET") + .map_err(|_| "STRINGWARS_DATASET environment variable not set")?; + let content = fs::read(&dataset_path)?; + Ok(content) +} + +/// Tokenizes the given haystack based on the `STRINGWARS_TOKENS` environment variable. +/// Supported modes: "lines", "words", and "file". 
+pub fn tokenize<'a>(haystack: &'a [u8]) -> Result, Box> { + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); + let tokens = match mode.as_str() { + "lines" => haystack + .split(|&c| c == b'\n') + .filter(|token| !token.is_empty()) + .collect(), + "words" => haystack + .split(|&c| c == b'\n' || c == b' ') + .filter(|token| !token.is_empty()) + .collect(), + "file" => vec![haystack], + other => { + return Err(format!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", + other + ) + .into()) + } + }; + Ok(tokens) +} + +/// Benchmarks forward substring search using "StringZilla", "MemMem", and standard strings. +fn bench_substring_forward( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + haystack: &[u8], + needles: &[&[u8]], +) { + g.throughput(Throughput::Bytes(haystack.len() as u64)); + + // Benchmark for StringZilla forward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); + g.bench_function("sz::find", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos: usize = 0; + while let Some(found) = sz::find(&haystack[pos..], token) { + pos += found + token.len(); + } + }) + }); + + // Benchmark for `memmem::find` forward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); + g.bench_function("memmem::find", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos: usize = 0; + while let Some(found) = memmem::find(&haystack[pos..], token) { + pos += found + token.len(); + } + }) + }); + + // Benchmark for default `std::str::find` forward search. + let mut tokens = needles.iter().cycle(); + g.bench_function("std::str::find", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos = 0; + while let Some(found) = haystack[pos..].find(token) { + pos += found + token.len(); + } + }) + }); + + // Benchmark for `memmem::find_iter` forward search using a cycle iterator. 
+ let mut tokens = needles.iter().cycle(); + g.bench_function("memmem::find_iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + for match_ in memmem::find_iter(haystack, token) { + black_box(match_); + } + }) + }); +} + +/// Benchmarks backward substring search using "StringZilla", "MemMem", and standard strings. +fn bench_substring_backward( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + haystack: &[u8], + needles: &[&[u8]], +) { + g.throughput(Throughput::Bytes(haystack.len() as u64)); + + // Benchmark for StringZilla backward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); + g.bench_function("sz::rfind", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos: Option = Some(haystack.len()); + while let Some(end) = pos { + if let Some(found) = sz::rfind(&haystack[..end], token) { + pos = Some(found); + } else { + break; + } + } + }) + }); + + // Benchmark for `memmem::rfind` backward search using a cycle iterator. + let mut tokens = needles.iter().cycle(); + g.bench_function("memmem::rfind", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos: Option = Some(haystack.len()); + while let Some(end) = pos { + if let Some(found) = memmem::rfind(&haystack[..end], token) { + pos = Some(found); + } else { + break; + } + } + }) + }); + + // Benchmark for default `std::str::rfind` backward search. + let mut tokens = needles.iter().cycle(); + g.bench_function("std::str::rfind", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + let mut pos: Option = Some(haystack.len()); + while let Some(end) = pos { + if let Some(found) = haystack[..end].rfind(token) { + pos = Some(found); + } else { + break; + } + } + }) + }); + + // Benchmark for `memmem::rfind_iter` backward search using a cycle iterator. 
+ let mut tokens = needles.iter().cycle(); + g.bench_function("memmem::rfind_iter", |b| { + b.iter(|| { + let token = black_box(*tokens.next().unwrap()); + for match_ in memmem::rfind_iter(haystack, token) { + black_box(match_); + } + }) + }); +} + +/// Benchmarks byteset search using "StringZilla", "bstr", "RegEx", and "AhoCorasick" +fn bench_byteset_forward( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + haystack: &[u8], + needles: &[&[u8]], +) { + g.throughput(Throughput::Bytes(3 * haystack.len() as u64)); + + // Define the three bytesets we will analyze. + const BYTES_TABS: &[u8] = b"\n\r\x0B\x0C"; + const BYTES_HTML: &[u8] = b"&'\"=[]"; + const BYTES_DIGITS: &[u8] = b"0123456789"; + + // Benchmark for StringZilla forward search using a cycle iterator. + let sz_tabs = sz::Byteset::from(BYTES_TABS); + let sz_html = sz::Byteset::from(BYTES_HTML); + let sz_digits = sz::Byteset::from(BYTES_DIGITS); + g.bench_function("sz::find_byteset", |b| { + b.iter(|| { + for token in needles.iter() { + let mut pos: usize = 0; + while let Some(found) = sz::find_byteset(&token[pos..], sz_tabs) { + pos += found + 1; + } + pos = 0; + while let Some(found) = sz::find_byteset(&token[pos..], sz_html) { + pos += found + 1; + } + pos = 0; + while let Some(found) = sz::find_byteset(&token[pos..], sz_digits) { + pos += found + 1; + } + } + }) + }); + + // Benchmark for bstr's byteset search. + g.bench_function("bstr::iter", |b| { + b.iter(|| { + for token in needles.iter() { + let mut pos: usize = 0; + // Inline search for `BYTES_TABS`. + while let Some(found) = token[pos..].iter().position(|&c| BYTES_TABS.contains(&c)) { + pos += found + 1; + } + pos = 0; + // Inline search for `BYTES_HTML`. + while let Some(found) = token[pos..].iter().position(|&c| BYTES_HTML.contains(&c)) { + pos += found + 1; + } + pos = 0; + // Inline search for `BYTES_DIGITS`. 
+ while let Some(found) = token[pos..].iter().position(|&c| BYTES_DIGITS.contains(&c)) + { + pos += found + 1; + } + } + }) + }); + + // Benchmark for Regex-based byteset search. + let re_tabs = Regex::new("[\n\r\x0B\x0C]").unwrap(); + let re_html = Regex::new("[&'\"=\\[\\]]").unwrap(); + let re_digits = Regex::new("[0-9]").unwrap(); + g.bench_function("regex::find_iter", |b| { + b.iter(|| { + for token in needles.iter() { + black_box(re_tabs.find_iter(token.as_bytes()).count()); + black_box(re_html.find_iter(token.as_bytes()).count()); + black_box(re_digits.find_iter(token.as_bytes()).count()); + } + }) + }); + + // Benchmark for Aho–Corasick-based byteset search. + let ac_tabs = AhoCorasick::new( + &BYTES_TABS + .iter() + .map(|&b| (b as char).to_string()) + .collect::>(), + ) + .expect("failed to create AhoCorasick FSA"); + let ac_html = AhoCorasick::new( + &BYTES_HTML + .iter() + .map(|&b| (b as char).to_string()) + .collect::>(), + ) + .expect("failed to create AhoCorasick FSA"); + let ac_digits = AhoCorasick::new( + &BYTES_DIGITS + .iter() + .map(|&b| (b as char).to_string()) + .collect::>(), + ) + .expect("failed to create AhoCorasick FSA"); + g.bench_function("aho_corasick::find_iter", |b| { + b.iter(|| { + for token in needles.iter() { + black_box(ac_tabs.find_iter(token).count()); + black_box(ac_html.find_iter(token).count()); + black_box(ac_digits.find_iter(token).count()); + } + }) + }); +} + +fn main() { + // Log StringZilla metadata + let v = sz::version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); + + // Load the dataset defined by the environment variables, and panic if the content is missing + let haystack = load_dataset().unwrap(); + let needles = tokenize(&haystack).unwrap(); + if needles.is_empty() { + panic!("No tokens found in the dataset."); + } + + // Setup the default durations + let mut 
criterion = Criterion::default() + .sample_size(10) // Each loop scans the whole dataset, but this can't be under 10 + .warm_up_time(Duration::from_secs(3)) // Let the CPU frequencies settle. + .measurement_time(Duration::from_secs(20)); // Actual measurement time. + + // Benchmarks for forward search + let mut group = criterion.benchmark_group("substring-forward"); + bench_substring_forward(&mut group, &haystack, &needles); + group.finish(); + + // Benchmarks for backward search + let mut group = criterion.benchmark_group("substring-backward"); + bench_substring_backward(&mut group, &haystack, &needles); + group.finish(); + + // Benchmarks for byteset search + let mut group = criterion.benchmark_group("byteset-forward"); + bench_byteset_forward(&mut group, &haystack, &needles); + group.finish(); + + criterion.final_summary(); +} diff --git a/bench_hash.rs b/bench_hash.rs new file mode 100644 index 0000000..a5e8a8f --- /dev/null +++ b/bench_hash.rs @@ -0,0 +1,243 @@ +#![doc = r#" +# StringWa.rs: String Hashing Benchmarks + +This file contains benchmarks for various Rust hashing libraries using Criterion, +treating the inputs as binary strings without any UTF-8 validity constraints. +For accurate stats aggregation, on each iteration, the whole file is scanned. +Be warned, for large files, it may take a while! + +The benchmarks compare the performance of different hash functions including: + +- Standard `Hash` implementation +- StringZilla (`bytesum`, `hash`, and incremental `hash` function variants) +- aHash (both incremental and single-entry variants) +- xxHash (xxh3) through the third-party `xxhash-rust` crate +- gxhash (gxhash64) +- Blake3 (the only cryptographic hash in the comparison, for reference) + +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. 
Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. + - `file`: Process the entire file as a single token. + +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_hash bench_hash --jobs 8 +``` + +For `gxhash`, ensure that your CPU supports the required AES and SSE2 instructions. +"#] +use std::env; +use std::error::Error; +use std::fs; + +use criterion::{black_box, Criterion, Throughput}; + +use ahash::{AHasher, RandomState}; +use blake3; +use gxhash; +use std::hash::{BuildHasher, Hasher}; +use stringzilla::sz; +use xxhash_rust::xxh3::xxh3_64; + +/// Loads the dataset from the file specified by the `STRINGWARS_DATASET` environment variable. +pub fn load_dataset() -> Result, Box> { + let dataset_path = env::var("STRINGWARS_DATASET") + .map_err(|_| "STRINGWARS_DATASET environment variable not set")?; + let content = fs::read(&dataset_path)?; + Ok(content) +} + +/// Tokenizes the given haystack based on the `STRINGWARS_TOKENS` environment variable. +/// Supported modes: "lines", "words", and "file". +pub fn tokenize<'a>(haystack: &'a [u8]) -> Result, Box> { + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); + let tokens = match mode.as_str() { + "lines" => haystack.split(|&c| c == b'\n').collect(), + "words" => haystack.split(|&c| c == b'\n' || c == b' ').collect(), + "file" => vec![haystack], + other => { + return Err(format!( + "Unknown STRINGWARS_TOKENS: {}. 
Use 'lines', 'words', or 'file'.", + other + ) + .into()) + } + }; + Ok(tokens) +} + +/// Benchmarks stateless hashes seeing the whole input at once +fn bench_stateless( + group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + tokens: &[&[u8]], +) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + group.throughput(Throughput::Bytes(total_bytes as u64)); + + // Benchmark: StringZilla `bytesum` + group.bench_function("stringzilla::bytesum", |b| { + b.iter(|| { + for token in tokens { + // Using black_box to prevent compiler optimizations. + let _hash = sz::bytesum(black_box(token)); + } + }) + }); + + // Benchmark: StringZilla `hash` + group.bench_function("stringzilla::hash", |b| { + b.iter(|| { + for token in tokens { + let _hash = sz::hash(black_box(token)); + } + }) + }); + + // Benchmark: SipHash via `std::hash::BuildHasher` + group.bench_function("std::hash::BuildHasher", |b| { + let std_builder = std::collections::hash_map::RandomState::new(); + b.iter(|| { + for token in tokens { + let mut hasher = std_builder.build_hasher(); + hasher.write(token); + let _hash = black_box(hasher.finish()); + } + }) + }); + + // Benchmark: aHash (`hash_one`) + group.bench_function("aHash::hash_one", |b| { + let hash_builder = RandomState::with_seed(42); + b.iter(|| { + for token in tokens { + let _hash = black_box(hash_builder.hash_one(token)); + } + }) + }); + + // Benchmark: xxHash (`xxh3`) + group.bench_function("xxh3::xxh3_64", |b| { + b.iter(|| { + for token in tokens { + let _hash = black_box(xxh3_64(token)); + } + }) + }); + + // Benchmark: gxhash + group.bench_function("gxhash::gxhash64", |b| { + b.iter(|| { + for token in tokens { + let _hash = black_box(gxhash::gxhash64(token, 42)); + } + }) + }); + + // Benchmark: Blake3 - should be by far the slowest, as it's a cryptographic hash. 
+ group.bench_function("blake3", |b| { + b.iter(|| { + for token in tokens { + let _hash = black_box(blake3::hash(token)); + } + }) + }); +} + +/// Benchmarks stateful hashes seeing one slice at a time +fn bench_stateful( + group: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + tokens: &[&[u8]], +) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + group.throughput(Throughput::Bytes(total_bytes as u64)); + + // Benchmark: StringZilla `bytesum` + group.bench_function("stringzilla::bytesum", |b| { + b.iter(|| { + let mut aggregate = 0u64; + for token in tokens { + aggregate += sz::bytesum(token); + } + black_box(aggregate); + }) + }); + + // Benchmark: StringZilla incremental `HashState` + group.bench_function("stringzilla::HashState", |b| { + b.iter(|| { + let mut aggregate = sz::HashState::new(0); + for token in tokens { + aggregate.stream(token); + } + black_box(aggregate.fold()); + }) + }); + + // Benchmark: SipHash via `std::hash::BuildHasher` + group.bench_function("std::hash::BuildHasher", |b| { + let std_builder = std::collections::hash_map::RandomState::new(); + b.iter(|| { + let mut aggregate = std_builder.build_hasher(); + for token in tokens { + aggregate.write(token); + } + black_box(aggregate.finish()); + }) + }); + + // Benchmark: aHash (incremental `AHasher`) + group.bench_function("aHash::AHasher", |b| { + b.iter(|| { + let mut aggregate = AHasher::default(); + for token in tokens { + aggregate.write(token); + } + black_box(aggregate.finish()); + }) + }); +} + +fn main() { + // Log StringZilla metadata + let v = sz::version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); + + // Load the dataset defined by the environment variables, and panic if the content is missing + let dataset = load_dataset().unwrap(); + let tokens = 
tokenize(&dataset).unwrap(); + if tokens.is_empty() { + panic!("No tokens found in the dataset."); + } + + let mut criterion = Criterion::default() + .configure_from_args() + .sample_size(30) // Number of samples to collect. + .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(10)); // Actual measurement time. + + // Profile incremental hash functions that see only a slice of data at a time + let mut group = criterion.benchmark_group("stateful"); + bench_stateful(&mut group, &tokens); + group.finish(); + + // Profile hash functions that see the whole input at once + let mut group = criterion.benchmark_group("stateless"); + bench_stateless(&mut group, &tokens); + group.finish(); + + criterion.final_summary(); +} diff --git a/bench_memory.rs b/bench_memory.rs new file mode 100644 index 0000000..b278c8a --- /dev/null +++ b/bench_memory.rs @@ -0,0 +1,201 @@ +#![doc = r#" +# StringWa.rs: Low-level Memory-related Benchmarks + +This file benchmarks low-level memory operations. The input file is treated as a collection +of size-representative tokens and for every token the following operations are benchmarked: + +- case inversion using Lookup Table Transforms (LUT), common in image processing +- memory obfuscation using Pseudo-Random Number Generators (PRNG), common in sensitive apps + +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. 
+ +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_memory bench_memory --jobs 8 +``` +"#] +use std::env; +use std::error::Error; +use std::fs; +use std::time::Duration; + +use criterion::{black_box, Criterion, Throughput}; + +use getrandom; +use rand; +use rand::{RngCore, SeedableRng}; +use rand_chacha; +use rand_xoshiro; +use stringzilla::sz; +use zeroize::Zeroize; + +/// Loads the dataset from the file specified by the `STRINGWARS_DATASET` environment variable. +pub fn load_dataset() -> Result, Box> { + let dataset_path = env::var("STRINGWARS_DATASET") + .map_err(|_| "STRINGWARS_DATASET environment variable not set")?; + let content = fs::read(&dataset_path)?; + Ok(content) +} + +/// Tokenizes the given haystack based on the `STRINGWARS_TOKENS` environment variable. +/// Supported modes: "lines", "words", and "file". +pub fn tokenize<'a>(haystack: &'a mut [u8]) -> Result, Box> { + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); + let tokens = match mode.as_str() { + "lines" => haystack.split_mut(|&c| c == b'\n').collect(), + "words" => haystack.split_mut(|&c| c == b'\n' || c == b' ').collect(), + "file" => vec![haystack], + other => { + return Err(format!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines', 'words', or 'file'.", + other + ) + .into()) + } + }; + Ok(tokens) +} + +fn bench_lookup_table( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + tokens: &mut [&mut [u8]], +) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + g.throughput(Throughput::Bytes(total_bytes as u64)); + + // Prepare a 256-entry lookup table that inverts the case of ASCII letters. 
+ let mut lookup_invert_case: [u8; 256] = core::array::from_fn(|i| i as u8); + for (upper, lower) in ('A'..='Z').zip('a'..='z') { + lookup_invert_case[upper as usize] = lower as u8; + } + for (upper, lower) in ('A'..='Z').zip('a'..='z') { + lookup_invert_case[lower as usize] = upper as u8; + } + + // Benchmark using StringZilla's `lookup_inplace`. + g.bench_function("sz::lookup_inplace", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + sz::lookup_inplace(&mut *token, lookup_invert_case); + black_box(token); + } + }) + }); + + // Benchmark a plain serial mapping using the same lookup table. + g.bench_function("serial", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + for byte in token.iter_mut() { + *byte = lookup_invert_case[*byte as usize]; + } + black_box(&token); + } + }) + }); +} + +fn bench_generate_random( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + tokens: &mut [&mut [u8]], +) { + // Calculate total bytes processed for throughput reporting + let total_bytes: usize = tokens.iter().map(|u| u.len()).sum(); + g.throughput(Throughput::Bytes(total_bytes as u64)); + + // Benchmark for StringZilla AES-based PRNG + g.bench_function("sz::fill_random", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + sz::fill_random(&mut *token, 0) + } + }) + }); + + // Benchmark using zeroize to obfuscate (zero out) the buffer. + g.bench_function("zeroize::zeroize", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + token.zeroize(); + black_box(&token); + } + }) + }); + + // Benchmark using `getrandom` to randomize the buffer via the OS. + g.bench_function("getrandom::fill", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + getrandom::fill(&mut *token).expect("getrandom failed"); + black_box(&token); + } + }) + }); + + // Benchmark using `rand_chacha::ChaCha20Rng`. 
+ g.bench_function("rand_chacha::ChaCha20Rng", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + let mut rng = rand_chacha::ChaCha20Rng::from_seed([0u8; 32]); + rng.fill_bytes(&mut *token); + black_box(&token); + } + }) + }); + + // Benchmark using `rand_xoshiro::Xoshiro128Plus`. + g.bench_function("rand_xoshiro::Xoshiro128Plus", |b| { + b.iter(|| { + for token in tokens.iter_mut() { + let mut rng = rand_xoshiro::Xoshiro128Plus::from_seed([0u8; 16]); + rng.fill_bytes(&mut *token); + black_box(&token); + } + }) + }); +} + +fn main() { + // Log StringZilla metadata + let v = sz::version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz::dynamic_dispatch()); + println!("- capabilities: {}", sz::capabilities().as_str()); + + // Load the dataset defined by the environment variables, and panic if the content is missing + let mut dataset = load_dataset().unwrap(); + let mut tokens = tokenize(&mut dataset).unwrap(); + if tokens.is_empty() { + panic!("No tokens found in the dataset."); + } + + // Setup the default durations + let mut criterion = Criterion::default() + .sample_size(10) // Each loop scans the whole dataset. + .warm_up_time(Duration::from_secs(1)) // Let the CPU frequencies settle. + .measurement_time(Duration::from_secs(20)); // Actual measurement time. 
+ + // Benchmarks for lookup table transform + let mut group = criterion.benchmark_group("lookup-table"); + bench_lookup_table(&mut group, &mut tokens[..]); + group.finish(); + + // Benchmarks for random string generation + let mut group = criterion.benchmark_group("generate-random"); + bench_generate_random(&mut group, &mut tokens[..]); + group.finish(); + + criterion.final_summary(); +} diff --git a/bench_sequence.rs b/bench_sequence.rs new file mode 100644 index 0000000..9f1888b --- /dev/null +++ b/bench_sequence.rs @@ -0,0 +1,161 @@ +#![doc = r#"# StringWa.rs: String Sequence Operations Benchmarks + +This file benchmarks various libraries for processing string-identifiable collections. +Including sorting arrays of strings: + +- StringZilla's `sz::argsort_permutation` +- The standard library's `sort_unstable` +- Rayon's parallel `par_sort_unstable` + +Intersecting string collections, similar to "STRICT INNER JOIN" in SQL databases. + +## Usage Example + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. 
+ +To run the benchmarks with the appropriate CPU features enabled, you can use the following commands: + +```sh +RUSTFLAGS="-C target-cpu=native" \ + RAYON_NUM_THREADS=1 \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_sequence bench_sequence --jobs 8 +``` +"#] + +use std::env; +use std::fs; +use std::sync::Arc; + +use criterion::{black_box, Criterion, SamplingMode}; + +use arrow::array::{ArrayRef, LargeStringArray}; +use arrow::compute::{lexsort_to_indices, SortColumn, SortOptions}; +use rayon::prelude::*; +use stringzilla::sz::{ + argsort_permutation as sz_argsort_permutation, + argsort_permutation_by as sz_argsort_permutation_by, +}; + +use stringzilla::sz::{ + // Pull some metadata logging functionality + capabilities as sz_capabilities, + dynamic_dispatch as sz_dynamic_dispatch, + version as sz_version, +}; + +fn log_stringzilla_metadata() { + let v = sz_version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); + println!("- capabilities: {}", sz_capabilities().as_str()); +} + +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(10) // Each loop processes the whole dataset. + .warm_up_time(std::time::Duration::from_secs(5)) // Let CPU frequencies settle. + .measurement_time(std::time::Duration::from_secs(10)) // Actual measurement time. +} + +fn load_data() -> Vec { + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); + + let content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); + let data: Vec = match mode.as_str() { + "lines" => content.lines().map(|line| line.to_string()).collect(), + "words" => content + .split_whitespace() + .map(|word| word.to_string()) + .collect(), + other => panic!( + "Unknown STRINGWARS_TOKENS: {}. 
Use 'lines' or 'words'.", + other + ), + }; + data +} + +fn bench_argsort(c: &mut Criterion) { + // Load the dataset once; each benchmark iteration will clone this unsorted data. + let unsorted = load_data(); + if unsorted.is_empty() { + panic!("No data found in dataset for sorting benchmark."); + } + + let mut group = c.benchmark_group("sorting"); + //? We have a very long benchmark, flat sampling is what we need. + //? https://bheisler.github.io/criterion.rs/book/user_guide/advanced_configuration.html#sampling-mode + group.sampling_mode(SamplingMode::Flat); + //? For comparison-based sorting algorithms, we can report throughput in terms of comparisons, + //? which is proportional to the number of elements in the array multiplied by the logarithm of + //? the number of elements. + let throughput = unsorted.len() as f64 * (unsorted.len() as f64).log2(); + group.throughput(criterion::Throughput::Elements(throughput as u64)); + + // Benchmark: StringZilla's argsort + group.bench_function("sz::argsort_permutation", |b| { + b.iter(|| { + let mut indices: Vec = (0..unsorted.len()).collect(); + sz_argsort_permutation(&unsorted, &mut indices).expect("StringZilla argsort failed"); + black_box(indices); + }) + }); + + // Benchmark: Apache Arrow's `lexsort_to_indices` + // https://arrow.apache.org/rust/arrow/compute/fn.lexsort.html + // https://arrow.apache.org/rust/arrow/compute/fn.lexsort_to_indices.html + // ! We can't use the conventional `StringArray` in most of our workloads, as it will + // ! overflow the 32-bit tape offset capacity and panic. 
+ let array = Arc::new(LargeStringArray::from(unsorted.clone())) as ArrayRef; + group.bench_function("arrow::lexsort_to_indices", |b| { + b.iter(|| { + let column_to_sort = SortColumn { + values: array.clone(), + options: Some(SortOptions { + descending: false, + nulls_first: true, + }), + }; + match lexsort_to_indices(&[column_to_sort], None) { + Ok(indices) => black_box(indices), + Err(e) => panic!("Arrow lexsort failed: {:?}", e), + } + }) + }); + + // Benchmark: Standard library argsort using `sort_unstable_by_key` + group.bench_function("std::sort_unstable_by_key", |b| { + b.iter(|| { + let mut indices: Vec = (0..unsorted.len()).collect(); + indices.sort_unstable_by_key(|&i| &unsorted[i]); + black_box(&indices); + }) + }); + + // Benchmark: Parallel argsort using Rayon + group.bench_function("rayon::par_sort_unstable_by_key", |b| { + b.iter(|| { + let mut indices: Vec = (0..unsorted.len()).collect(); + indices.par_sort_unstable_by_key(|&i| &unsorted[i]); + black_box(&indices); + }) + }); + + group.finish(); +} + +fn main() { + log_stringzilla_metadata(); + let mut criterion = configure_bench(); + bench_argsort(&mut criterion); + criterion.final_summary(); +} diff --git a/bench_similarity.rs b/bench_similarity.rs new file mode 100644 index 0000000..ba58149 --- /dev/null +++ b/bench_similarity.rs @@ -0,0 +1,255 @@ +#![doc = r#"# StringWa.rs: String Similarity Benchmarks + +This file benchmarks different libraries implementing string alignment and edit +distance calculation, for both generic Levenshtein distances and the weighted +Needleman-Wunsch alignment scores used in Bioinformatics. + +The input file is tokenized into lines or words and each consecutive pair of tokens +is evaluated for similarity. As most algorithms have quadratic complexity and use +Dynamic Programming techniques, their throughput is evaluate in the number of CUPS, +or Cell Updates Per Second. 
+ +## Usage Examples + +The benchmarks use two environment variables to control the input dataset and mode: + +- `STRINGWARS_DATASET`: Path to the input dataset file. +- `STRINGWARS_TOKENS`: Specifies how to interpret the input. Allowed values: + - `lines`: Process the dataset line by line. + - `words`: Process the dataset word by word. +- `STRINGWARS_ERROR_BOUND`: Maximum error bound, defined as an integer percent. + +```sh +RUSTFLAGS="-C target-cpu=native" \ + STRINGWARS_DATASET=README.md \ + STRINGWARS_ERROR_BOUND=15 \ + STRINGWARS_TOKENS=lines \ + cargo criterion --features bench_similarity bench_similarity --jobs 8 +``` +"#] +use std::env; +use std::fs; + +use criterion::Criterion; + +use rapidfuzz::distance::levenshtein; +use stringzilla::sz::{ + alignment_score as sz_alignment_score, // + levenshtein_distance as sz_levenshtein_distance, + levenshtein_distance_bounded as sz_levenshtein_distance_bounded, + levenshtein_distance_utf8 as sz_levenshtein_distance_utf8, + levenshtein_distance_utf8_bounded as sz_levenshtein_distance_utf8_bounded, + unary_substitution_costs as sz_unary_substitution_costs, +}; + +use stringzilla::sz::{ + // Pull some metadata logging functionality + capabilities as sz_capabilities, + dynamic_dispatch as sz_dynamic_dispatch, + version as sz_version, +}; + +fn log_stringzilla_metadata() { + let v = sz_version(); + println!("StringZilla v{}.{}.{}", v.major, v.minor, v.patch); + println!("- uses dynamic dispatch: {}", sz_dynamic_dispatch()); + println!("- capabilities: {}", sz_capabilities().as_str()); +} + +fn configure_bench() -> Criterion { + Criterion::default() + .sample_size(1000) + .warm_up_time(std::time::Duration::from_secs(10)) + .measurement_time(std::time::Duration::from_secs(120)) +} + +fn bench_levenshtein(c: &mut Criterion) { + let dataset_path = + env::var("STRINGWARS_DATASET").expect("STRINGWARS_DATASET environment variable not set"); + let mode = env::var("STRINGWARS_TOKENS").unwrap_or_else(|_| "lines".to_string()); + let 
content = fs::read_to_string(&dataset_path).expect("Could not read dataset"); + + let bound_percent = env::var("STRINGWARS_ERROR_BOUND") + .unwrap_or_else(|_| "15".to_string()) + .parse::() + .expect("STRINGWARS_ERROR_BOUND must be a number"); + + let max_pairs = env::var("STRINGWARS_MAX_PAIRS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(100); + + let units: Vec<&str> = match mode.as_str() { + "words" => content.split_whitespace().collect(), + "lines" => content.lines().collect(), + other => panic!( + "Unknown STRINGWARS_TOKENS: {}. Use 'lines' or 'words'.", + other + ), + }; + + if units.len() < 2 { + panic!("Dataset must contain at least two items for comparisons."); + } + + let mut pairs: Vec<(&str, &str)> = units + .chunks(2) + .filter_map(|chunk| { + if chunk.len() == 2 { + Some((chunk[0], chunk[1])) + } else { + None + } + }) + .collect(); + + if pairs.is_empty() { + panic!("No pairs could be formed from the dataset."); + } + + if pairs.len() > max_pairs { + pairs.truncate(max_pairs); + } + + // In "unbounded" benchmarks we report the total number of Dynamic + // Programming (DP) matrix evaluated by the algorithm, aka "CUPS". + let mut g = c.benchmark_group("unbounded"); + g.throughput(Throughput::Bytes(haystack_length as u64)); + perform_unbounded_benchmarks(&mut g, &pairs, &pair_bounds); + g.finish(); + + // In case of "bounded" benchmarks, only one band of the DP matrix + // needs to be evaluated, so the throughput is computed differently. 
+ let pair_bounds: Vec = pairs + .iter() + .map(|(a, b)| { + let max_len = a.len().max(b.len()); + ((max_len as u64 * bound_percent) / 100) as usize + }) + .collect(); +} + +fn perform_unbounded_benchmarks( + g: &mut criterion::BenchmarkGroup<'_, criterion::measurement::WallTime>, + pairs: &[(&str, &str)], + pair_bounds: &[usize], +) { + // StringZilla, bytes-based, unbounded + { + let mut pair_index = 0; + g.bench_function("sz::levenshtein_bytes_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = a.sz_edit_distance(b_str.as_bytes()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // StringZilla, bytes-based, bounded + { + let mut pair_index = 0; + g.bench_function("sz::levenshtein_bytes_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = a + .as_bytes() + .sz_edit_distance_bounded(b_str.as_bytes(), bound); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // StringZilla, UTF-8, unbounded + { + let mut pair_index = 0; + g.bench_function("sz::levenshtein_utf8_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = a.as_bytes().sz_edit_distance_utf8(b_str.as_bytes()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // StringZilla, UTF-8, bounded + { + let mut pair_index = 0; + g.bench_function("sz::levenshtein_utf8_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = a + .as_bytes() + .sz_edit_distance_utf8_bounded(b_str.as_bytes(), bound); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, ASCII (bytes) unbounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_bytes_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = levenshtein::distance(a.bytes(), b_str.bytes()); + pair_index = (pair_index + 1) % pairs.len(); + }) 
+ }); + } + + // RapidFuzz, ASCII (bytes) bounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_bytes_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = levenshtein::distance_with_args( + a.bytes(), + b_str.bytes(), + &levenshtein::Args::default().score_cutoff(bound), + ); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, UTF-8 (chars) unbounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_utf8_unbounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let _distance = levenshtein::distance(a.chars(), b_str.chars()); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } + + // RapidFuzz, UTF-8 (chars) bounded + { + let mut pair_index = 0; + g.bench_function("rapidfuzz::levenshtein_utf8_bounded", |b| { + b.iter(|| { + let (a, b_str) = pairs[pair_index]; + let bound = pair_bounds[pair_index]; + let _distance = levenshtein::distance_with_args( + a.chars(), + b_str.chars(), + &levenshtein::Args::default().score_cutoff(bound), + ); + pair_index = (pair_index + 1) % pairs.len(); + }) + }); + } +} + +fn main() { + log_stringzilla_metadata(); + let mut criterion = configure_bench(); + bench_levenshtein(&mut criterion); + criterion.final_summary(); +} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 83bd5a8..0000000 --- a/src/lib.rs +++ /dev/null @@ -1,12 +0,0 @@ -use rand::{rngs::ThreadRng, Rng}; - -pub fn open() -> Vec { - let path = std::env::var("FILE").unwrap_or_default(); - let file = std::fs::read(path).unwrap(); - file -} - -pub fn random_token<'a, 'b>(rng: &'b mut ThreadRng, file: &'a Vec) -> &'a [u8] { - let tokens: Vec<&[u8]> = file.split(|c| *c == b' ').collect(); - tokens[rng.gen_range(0..tokens.len())] -}