From 1fd934086417decf2ce10d15e8e579916a044b2b Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Sat, 23 Aug 2025 09:28:17 -0700 Subject: [PATCH 01/10] WIP: implement duplicate detection logic --- crates/bitwarden-vault/src/cipher/login.rs | 14 ++++++++++++++ crates/bitwarden-vault/src/lib.rs | 1 + crates/bitwarden-vault/src/traits/duplicates.rs | 5 +++++ crates/bitwarden-vault/src/traits/mod.rs | 1 + 4 files changed, 21 insertions(+) create mode 100644 crates/bitwarden-vault/src/traits/duplicates.rs create mode 100644 crates/bitwarden-vault/src/traits/mod.rs diff --git a/crates/bitwarden-vault/src/cipher/login.rs b/crates/bitwarden-vault/src/cipher/login.rs index 5e6d952ea..5c7d68f48 100644 --- a/crates/bitwarden-vault/src/cipher/login.rs +++ b/crates/bitwarden-vault/src/cipher/login.rs @@ -18,6 +18,7 @@ use wasm_bindgen::prelude::wasm_bindgen; use super::cipher::CipherKind; use crate::{cipher::cipher::CopyableCipherFields, Cipher, VaultParseError}; +use crate::traits::duplicates::HasDuplicates; #[allow(missing_docs)] #[derive(Clone, Copy, Serialize_repr, Deserialize_repr, Debug, PartialEq)] @@ -326,6 +327,19 @@ pub struct LoginListView { pub uris: Option>, } +impl HasDuplicates for LoginListView { + fn find_duplicates(&self, strategy: UriMatchType) -> Vec { + match strategy { + UriMatchType::Domain => todo!(), + UriMatchType::Host => todo!(), + UriMatchType::StartsWith => todo!(), + UriMatchType::Exact => todo!(), + UriMatchType::RegularExpression => todo!(), + UriMatchType::Never => todo!(), + } + } +} + impl CompositeEncryptable for LoginUriView { fn encrypt_composite( &self, diff --git a/crates/bitwarden-vault/src/lib.rs b/crates/bitwarden-vault/src/lib.rs index c9a8df332..9376cfa8e 100644 --- a/crates/bitwarden-vault/src/lib.rs +++ b/crates/bitwarden-vault/src/lib.rs @@ -6,6 +6,7 @@ uniffi::setup_scaffolding!(); mod uniffi_support; mod cipher; +pub(crate) mod traits; pub use cipher::*; mod folder; pub use folder::*; diff --git a/crates/bitwarden-vault/src/traits/duplicates.rs b/crates/bitwarden-vault/src/traits/duplicates.rs new file mode 100644 index 000000000..56c837972 --- /dev/null +++ b/crates/bitwarden-vault/src/traits/duplicates.rs @@ -0,0 +1,5 @@ +use crate::cipher::login::UriMatchType; + +pub(crate) trait HasDuplicates { + fn find_duplicates(&self, strategy: UriMatchType) -> Vec; +} \ No newline at end of file diff --git a/crates/bitwarden-vault/src/traits/mod.rs b/crates/bitwarden-vault/src/traits/mod.rs new file mode 100644 index 000000000..febcf03a2 --- /dev/null +++ b/crates/bitwarden-vault/src/traits/mod.rs @@ -0,0 +1 @@ +pub(crate) mod duplicates; \ No newline at end of file From 941416e61e99a8b52a343b6515af374272233d12 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Wed, 27 Aug 2025 15:58:41 -0700 Subject: [PATCH 02/10] WIP: duplicate detection TODO: add tests and review guidelines --- Cargo.lock | 38 +++++++--- crates/bitwarden-vault/Cargo.toml | 6 +- crates/bitwarden-vault/src/cipher/login.rs | 21 ++---- .../src/duplicate_detection.rs | 69 +++++++++++++++++++ crates/bitwarden-vault/src/lib.rs | 3 +- .../bitwarden-vault/src/traits/duplicates.rs | 5 -- crates/bitwarden-vault/src/traits/mod.rs | 1 - 7 files changed, 110 insertions(+), 33 deletions(-) create mode 100644 crates/bitwarden-vault/src/duplicate_detection.rs delete mode 100644 crates/bitwarden-vault/src/traits/duplicates.rs delete mode 100644 crates/bitwarden-vault/src/traits/mod.rs diff --git a/Cargo.lock 
b/Cargo.lock index b1b92ed84..7927721bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -777,6 +777,7 @@ dependencies = [ "data-encoding", "hmac", "percent-encoding", + "psl", "reqwest", "serde", "serde_json", @@ -787,6 +788,7 @@ dependencies = [ "tokio", "tsify", "uniffi", + "url", "uuid", "wasm-bindgen", "wasm-bindgen-futures", @@ -1724,9 +1726,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] @@ -2265,9 +2267,9 @@ dependencies = [ [[package]] name = "idna" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" dependencies = [ "idna_adapter", "smallvec", @@ -2888,9 +2890,9 @@ dependencies = [ [[package]] name = "percent-encoding" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "pin-project-lite" @@ -3077,6 +3079,21 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "psl" +version = "2.1.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a4d5ec1bed313b61a9d525e8549493538aea497056a5484f5398b1a05bb8261" +dependencies = [ + "psl-types", +] + +[[package]] +name = "psl-types" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" + [[package]] name = "public-suffix" version = "0.1.1" @@ -4683,13 +4700,14 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.4" +version = "2.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" dependencies = [ "form_urlencoded", - "idna 1.0.3", + "idna 1.1.0", "percent-encoding", + "serde", ] [[package]] @@ -4722,7 +4740,7 @@ version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43fb22e1a008ece370ce08a3e9e4447a910e92621bb49b85d6e48a45397e7cfa" dependencies = [ - "idna 1.0.3", + "idna 1.1.0", "once_cell", "regex", "serde", diff --git a/crates/bitwarden-vault/Cargo.toml b/crates/bitwarden-vault/Cargo.toml index fd5e40540..26383a43f 100644 --- a/crates/bitwarden-vault/Cargo.toml +++ b/crates/bitwarden-vault/Cargo.toml @@ -18,7 +18,7 @@ keywords.workspace = true uniffi = [ "bitwarden-core/uniffi", "bitwarden-crypto/uniffi", - "dep:uniffi" + "dep:uniffi", ] # Uniffi bindings wasm = [ "bitwarden-collections/wasm", @@ -26,7 +26,7 @@ wasm = [ "bitwarden-encoding/wasm", "dep:tsify", "dep:wasm-bindgen", - "dep:wasm-bindgen-futures" + "dep:wasm-bindgen-futures", ] # WASM support [dependencies] @@ -43,6 +43,7 @@ chrono = { workspace = true } data-encoding = { workspace = true } hmac = ">=0.12.1, <0.13" percent-encoding = ">=2.1, <3.0" +psl = "~2.1.135" 
reqwest = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } @@ -52,6 +53,7 @@ sha2 = ">=0.10.6, <0.11" thiserror = { workspace = true } tsify = { workspace = true, optional = true } uniffi = { workspace = true, optional = true } +url = "~2.5.7" uuid = { workspace = true } wasm-bindgen = { workspace = true, optional = true } wasm-bindgen-futures = { workspace = true, optional = true } diff --git a/crates/bitwarden-vault/src/cipher/login.rs b/crates/bitwarden-vault/src/cipher/login.rs index 5c7d68f48..d37f5a967 100644 --- a/crates/bitwarden-vault/src/cipher/login.rs +++ b/crates/bitwarden-vault/src/cipher/login.rs @@ -18,7 +18,6 @@ use wasm_bindgen::prelude::wasm_bindgen; use super::cipher::CipherKind; use crate::{cipher::cipher::CopyableCipherFields, Cipher, VaultParseError}; -use crate::traits::duplicates::HasDuplicates; #[allow(missing_docs)] #[derive(Clone, Copy, Serialize_repr, Deserialize_repr, Debug, PartialEq)] @@ -35,6 +34,13 @@ pub enum UriMatchType { Never = 5, } +pub enum DuplicateUriMatchType { + Domain, + Hostname, + Host, + Exact, +} + #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase", deny_unknown_fields)] #[cfg_attr(feature = "uniffi", derive(uniffi::Record))] @@ -327,19 +333,6 @@ pub struct LoginListView { pub uris: Option>, } -impl HasDuplicates for LoginListView { - fn find_duplicates(&self, strategy: UriMatchType) -> Vec { - match strategy { - UriMatchType::Domain => todo!(), - UriMatchType::Host => todo!(), - UriMatchType::StartsWith => todo!(), - UriMatchType::Exact => todo!(), - UriMatchType::RegularExpression => todo!(), - UriMatchType::Never => todo!(), - } - } -} - impl CompositeEncryptable for LoginUriView { fn encrypt_composite( &self, diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs new file mode 100644 index 000000000..72ee26dd0 --- /dev/null +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -0,0 +1,69 @@ +use std::{collections::HashMap, str::FromStr}; + +use bitwarden_crypto::EncString; +use url::Url; + +use crate::{cipher::login::DuplicateUriMatchType, Cipher}; + +fn normalize_name_for_matching(s: &str) -> String { + // currently only removes internal and external whitespace + s.chars().filter(|c| !c.is_whitespace()).collect() +} + +fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Option { + match strategy { + DuplicateUriMatchType::Domain => Url::parse(uri).ok().and_then(|url| { + url.host_str() + .and_then(|hostname| psl::domain_str(hostname).map(|domain| domain.to_string())) + }), + DuplicateUriMatchType::Hostname => Url::parse(uri) + .ok() + .and_then(|url| url.host_str().map(|hostname| hostname.to_string())), + DuplicateUriMatchType::Host => Url::parse(uri).ok().map(|url| { + let hostname = url.host_str().map(|hostname| hostname.to_string()); + let default_port = if url.scheme() == "http" { 80 } else { 443 }; + format!( + "{}:{}", + hostname.unwrap_or_default(), + url.port().unwrap_or(default_port) + ) + }), + DuplicateUriMatchType::Exact => Some(uri.to_string()), + } +} + +fn find_duplicates<'a>(ciphers: &'a [Cipher], strategy: DuplicateUriMatchType) -> Vec<&'a Cipher> { + let mut counts: HashMap> = HashMap::new(); + + for cipher in ciphers.iter() { + if let Some(login) = cipher.login.as_ref() { + // for login ciphers, match on username and uri + if let Some(uris) = login.uris.as_ref() { + let empty_string = + EncString::from_str("").expect("Failed to create empty EncString"); + let 
username = login.username.as_ref().unwrap_or(&empty_string).to_string(); + for login_uri_view in uris.iter() { + if let Some(uri) = login_uri_view.uri.as_ref() { + if let Some(normalized_uri) = + normalize_uri_for_matching(&uri.to_string(), &strategy) + { + let key = format!("USERNAME+URI:{}:{}", username, normalized_uri); + counts.entry(key).or_default().push(cipher); + } + } + } + } + } else { + // if not login, match on cipher name + let name = normalize_name_for_matching(&cipher.name.to_string()); + let key = format!("NAME:{}", name); + counts.entry(key).or_default().push(cipher); + } + } + + counts + .into_values() + .filter(|ciphers| ciphers.len() >= 2) + .flatten() + .collect() +} diff --git a/crates/bitwarden-vault/src/lib.rs b/crates/bitwarden-vault/src/lib.rs index 9376cfa8e..6df60c407 100644 --- a/crates/bitwarden-vault/src/lib.rs +++ b/crates/bitwarden-vault/src/lib.rs @@ -6,8 +6,9 @@ uniffi::setup_scaffolding!(); mod uniffi_support; mod cipher; -pub(crate) mod traits; pub use cipher::*; +mod duplicate_detection; +pub use duplicate_detection::*; mod folder; pub use folder::*; mod password_history; diff --git a/crates/bitwarden-vault/src/traits/duplicates.rs b/crates/bitwarden-vault/src/traits/duplicates.rs deleted file mode 100644 index 56c837972..000000000 --- a/crates/bitwarden-vault/src/traits/duplicates.rs +++ /dev/null @@ -1,5 +0,0 @@ -use crate::cipher::login::UriMatchType; - -pub(crate) trait HasDuplicates { - fn find_duplicates(&self, strategy: UriMatchType) -> Vec; -} \ No newline at end of file diff --git a/crates/bitwarden-vault/src/traits/mod.rs b/crates/bitwarden-vault/src/traits/mod.rs deleted file mode 100644 index febcf03a2..000000000 --- a/crates/bitwarden-vault/src/traits/mod.rs +++ /dev/null @@ -1 +0,0 @@ -pub(crate) mod duplicates; \ No newline at end of file From a65ff0308cd9dbb71dfd73a17cd854e42e88dfd0 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Wed, 27 Aug 2025 16:00:41 -0700 Subject: [PATCH 03/10] WIP: duplicate detection TODO: add tests and review guidelines --- Cargo.toml | 30 ++++++++++++------------- crates/bitwarden-auth/Cargo.toml | 2 +- crates/bitwarden-collections/Cargo.toml | 2 +- crates/bitwarden-core/Cargo.toml | 4 ++-- crates/bitwarden-error-macro/Cargo.toml | 12 +++++----- crates/bitwarden-error/Cargo.toml | 8 +++---- crates/bitwarden-exporters/Cargo.toml | 2 +- crates/bitwarden-generators/Cargo.toml | 2 +- crates/bitwarden-ipc/Cargo.toml | 2 +- crates/bitwarden-send/Cargo.toml | 2 +- crates/bitwarden-ssh/Cargo.toml | 2 +- crates/bitwarden-threading/Cargo.toml | 2 +- crates/bitwarden-uniffi/Cargo.toml | 4 ++-- crates/bitwarden-uuid-macro/Cargo.toml | 6 ++--- crates/bitwarden-uuid/Cargo.toml | 6 ++--- 15 files changed, 43 insertions(+), 43 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index cd2aa58b8..6409d75b8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,9 @@ keywords = ["bitwarden"] # Define dependencies that are expected to be consistent across all crates [workspace.dependencies] + +# External crates that are expected to maintain a consistent version across all crates +async-trait = ">=0.1.80, <0.2" bitwarden = { path = "crates/bitwarden", version = "=1.0.0" } bitwarden-api-api = { path = "crates/bitwarden-api-api", version = "=1.0.0" } bitwarden-api-identity = { path = "crates/bitwarden-api-identity", version = "=1.0.0" } @@ -43,9 +46,6 @@ bitwarden-uniffi-error = { path = "crates/bitwarden-uniffi-error", version = "=1 bitwarden-uuid = { path = "crates/bitwarden-uuid", 
version = "=1.0.0" } bitwarden-uuid-macro = { path = "crates/bitwarden-uuid-macro", version = "=1.0.0" } bitwarden-vault = { path = "crates/bitwarden-vault", version = "=1.0.0" } - -# External crates that are expected to maintain a consistent version across all crates -async-trait = ">=0.1.80, <0.2" chrono = { version = ">=0.4.26, <0.5", features = [ "clock", "serde", @@ -63,11 +63,11 @@ reqwest = { version = ">=0.12.5, <0.13", features = [ ], default-features = false } schemars = { version = ">=1.0.0, <2.0", features = ["uuid1", "chrono04"] } serde = { version = ">=1.0, <2.0", features = ["derive"] } +serde-wasm-bindgen = ">=0.6.0, <0.7" serde_bytes = { version = ">=0.11.17, <0.12.0" } serde_json = ">=1.0.96, <2.0" serde_qs = ">=0.12.0, <0.16" serde_repr = ">=0.1.12, <0.2" -serde-wasm-bindgen = ">=0.6.0, <0.7" syn = ">=2.0.87, <3" thiserror = ">=1.0.40, <3" tokio = { version = "1.36.0", features = ["macros"] } @@ -82,12 +82,6 @@ wasm-bindgen-futures = "0.4.41" wasm-bindgen-test = "0.3.45" wiremock = ">=0.6.0, <0.7" -# There is an incompatibility when using pkcs5 and chacha20 on wasm builds. This can be removed once a new -# rustcrypto-formats crate version is released since the fix has been upstreamed. -# https://github.com/RustCrypto/formats/pull/1625 -[patch.crates-io] -pkcs5 = { git = "https://github.com/bitwarden/rustcrypto-formats.git", rev = "2b27c63034217dd126bbf5ed874da51b84f8c705" } - [workspace.lints.clippy] unused_async = "deny" unwrap_used = "deny" @@ -96,17 +90,23 @@ string_slice = "warn" [workspace.lints.rust] missing_docs = "warn" -# Compile all dependencies with some optimizations when building this crate on debug -# This slows down clean builds by about 50%, but the resulting binaries can be orders of magnitude faster -# As clean builds won't occur very often, this won't slow down the development process -[profile.dev.package."*"] -opt-level = 2 +# There is an incompatibility when using pkcs5 and chacha20 on wasm builds. This can be removed once a new +# rustcrypto-formats crate version is released since the fix has been upstreamed. +# https://github.com/RustCrypto/formats/pull/1625 +[patch.crates-io] +pkcs5 = { git = "https://github.com/bitwarden/rustcrypto-formats.git", rev = "2b27c63034217dd126bbf5ed874da51b84f8c705" } # Turn on a small amount of optimisation in development mode. 
This might interfere when trying to use a debugger # if the compiler decides to optimize some code away, if that's the case, it can be set to 0 or commented out [profile.dev] opt-level = 1 +# Compile all dependencies with some optimizations when building this crate on debug +# This slows down clean builds by about 50%, but the resulting binaries can be orders of magnitude faster +# As clean builds won't occur very often, this won't slow down the development process +[profile.dev.package."*"] +opt-level = 2 + # Turn on LTO on release mode [profile.release] codegen-units = 1 diff --git a/crates/bitwarden-auth/Cargo.toml b/crates/bitwarden-auth/Cargo.toml index 787a5b2b1..ab2fdb0eb 100644 --- a/crates/bitwarden-auth/Cargo.toml +++ b/crates/bitwarden-auth/Cargo.toml @@ -20,7 +20,7 @@ wasm = [ "bitwarden-core/wasm", "dep:tsify", "dep:wasm-bindgen", - "dep:wasm-bindgen-futures" + "dep:wasm-bindgen-futures", ] # WASM support [dependencies] diff --git a/crates/bitwarden-collections/Cargo.toml b/crates/bitwarden-collections/Cargo.toml index 72d824687..6c144ba9d 100644 --- a/crates/bitwarden-collections/Cargo.toml +++ b/crates/bitwarden-collections/Cargo.toml @@ -14,7 +14,7 @@ keywords.workspace = true uniffi = [ "bitwarden-core/uniffi", "bitwarden-crypto/uniffi", - "dep:uniffi" + "dep:uniffi", ] # Uniffi bindings wasm = ["bitwarden-core/wasm", "dep:tsify", "dep:wasm-bindgen"] # WASM support diff --git a/crates/bitwarden-core/Cargo.toml b/crates/bitwarden-core/Cargo.toml index a76a1a92e..fdc1471f6 100644 --- a/crates/bitwarden-core/Cargo.toml +++ b/crates/bitwarden-core/Cargo.toml @@ -17,7 +17,7 @@ keywords.workspace = true [features] internal = ["dep:zxcvbn"] no-memory-hardening = [ - "bitwarden-crypto/no-memory-hardening" + "bitwarden-crypto/no-memory-hardening", ] # Disable memory hardening features secrets = [] # Secrets manager API uniffi = [ @@ -29,7 +29,7 @@ wasm = [ "bitwarden-error/wasm", "dep:wasm-bindgen", "dep:wasm-bindgen-futures", - "dep:tsify" + "dep:tsify", ] # WASM support [dependencies] diff --git a/crates/bitwarden-error-macro/Cargo.toml b/crates/bitwarden-error-macro/Cargo.toml index 19d78f001..ac88af95b 100644 --- a/crates/bitwarden-error-macro/Cargo.toml +++ b/crates/bitwarden-error-macro/Cargo.toml @@ -14,6 +14,9 @@ repository.workspace = true license-file.workspace = true keywords.workspace = true +[lib] +proc-macro = true + [features] wasm = [] @@ -23,12 +26,6 @@ proc-macro2 = { workspace = true } quote = { workspace = true } syn = { workspace = true } -[lints] -workspace = true - -[lib] -proc-macro = true - [dev-dependencies] bitwarden-error = { workspace = true, features = ["wasm"] } js-sys.workspace = true @@ -36,3 +33,6 @@ serde.workspace = true thiserror.workspace = true tsify.workspace = true wasm-bindgen.workspace = true + +[lints] +workspace = true diff --git a/crates/bitwarden-error/Cargo.toml b/crates/bitwarden-error/Cargo.toml index 9757669ed..ebbfe7a8c 100644 --- a/crates/bitwarden-error/Cargo.toml +++ b/crates/bitwarden-error/Cargo.toml @@ -19,7 +19,7 @@ wasm = [ "bitwarden-error-macro/wasm", "dep:js-sys", "dep:tsify", - "dep:wasm-bindgen" + "dep:wasm-bindgen", ] [dependencies] @@ -28,10 +28,10 @@ js-sys = { workspace = true, optional = true } tsify = { workspace = true, optional = true } wasm-bindgen = { workspace = true, optional = true } -[lints] -workspace = true - [dev-dependencies] serde.workspace = true trybuild = "1.0.101" wasm-bindgen-test = { workspace = true } + +[lints] +workspace = true diff --git a/crates/bitwarden-exporters/Cargo.toml 
b/crates/bitwarden-exporters/Cargo.toml index 4f39acfb8..f3fe09280 100644 --- a/crates/bitwarden-exporters/Cargo.toml +++ b/crates/bitwarden-exporters/Cargo.toml @@ -21,7 +21,7 @@ wasm = [ "bitwarden-collections/wasm", "bitwarden-vault/wasm", "dep:tsify", - "dep:wasm-bindgen" + "dep:wasm-bindgen", ] # WebAssembly bindings [dependencies] diff --git a/crates/bitwarden-generators/Cargo.toml b/crates/bitwarden-generators/Cargo.toml index 93f601cee..e3f4ea080 100644 --- a/crates/bitwarden-generators/Cargo.toml +++ b/crates/bitwarden-generators/Cargo.toml @@ -19,7 +19,7 @@ uniffi = ["dep:uniffi"] # Uniffi bindings wasm = [ "bitwarden-core/wasm", "dep:tsify", - "dep:wasm-bindgen" + "dep:wasm-bindgen", ] # WebAssembly bindings [dependencies] diff --git a/crates/bitwarden-ipc/Cargo.toml b/crates/bitwarden-ipc/Cargo.toml index e181292fe..ae4036230 100644 --- a/crates/bitwarden-ipc/Cargo.toml +++ b/crates/bitwarden-ipc/Cargo.toml @@ -16,7 +16,7 @@ wasm = [ "dep:wasm-bindgen-futures", "dep:js-sys", "bitwarden-error/wasm", - "bitwarden-threading/wasm" + "bitwarden-threading/wasm", ] # WASM support [dependencies] diff --git a/crates/bitwarden-send/Cargo.toml b/crates/bitwarden-send/Cargo.toml index 161be4b93..a5b500b8d 100644 --- a/crates/bitwarden-send/Cargo.toml +++ b/crates/bitwarden-send/Cargo.toml @@ -18,7 +18,7 @@ keywords.workspace = true uniffi = [ "bitwarden-core/uniffi", "bitwarden-crypto/uniffi", - "dep:uniffi" + "dep:uniffi", ] # Uniffi bindings [dependencies] diff --git a/crates/bitwarden-ssh/Cargo.toml b/crates/bitwarden-ssh/Cargo.toml index 594103eae..870ac043d 100644 --- a/crates/bitwarden-ssh/Cargo.toml +++ b/crates/bitwarden-ssh/Cargo.toml @@ -19,7 +19,7 @@ keywords.workspace = true wasm = [ "bitwarden-error/wasm", "dep:tsify", - "dep:wasm-bindgen" + "dep:wasm-bindgen", ] # WASM support uniffi = ["dep:uniffi"] # Uniffi bindings diff --git a/crates/bitwarden-threading/Cargo.toml b/crates/bitwarden-threading/Cargo.toml index b322847e3..86ebae0f3 100644 --- a/crates/bitwarden-threading/Cargo.toml +++ b/crates/bitwarden-threading/Cargo.toml @@ -17,7 +17,7 @@ wasm = [ "dep:wasm-bindgen", "dep:wasm-bindgen-futures", "dep:js-sys", - "dep:gloo-timers" + "dep:gloo-timers", ] [dependencies] diff --git a/crates/bitwarden-uniffi/Cargo.toml b/crates/bitwarden-uniffi/Cargo.toml index 76a489869..b74299955 100644 --- a/crates/bitwarden-uniffi/Cargo.toml +++ b/crates/bitwarden-uniffi/Cargo.toml @@ -12,12 +12,12 @@ repository.workspace = true license-file.workspace = true keywords.workspace = true -[features] - [lib] crate-type = ["lib", "staticlib", "cdylib"] bench = false +[features] + [dependencies] async-trait = { workspace = true } bitwarden-collections = { workspace = true, features = ["uniffi"] } diff --git a/crates/bitwarden-uuid-macro/Cargo.toml b/crates/bitwarden-uuid-macro/Cargo.toml index de8740350..ec9d8fb69 100644 --- a/crates/bitwarden-uuid-macro/Cargo.toml +++ b/crates/bitwarden-uuid-macro/Cargo.toml @@ -14,12 +14,12 @@ repository.workspace = true license-file.workspace = true keywords.workspace = true +[lib] +proc-macro = true + [dependencies] quote = { workspace = true } syn = { workspace = true } [lints] workspace = true - -[lib] -proc-macro = true diff --git a/crates/bitwarden-uuid/Cargo.toml b/crates/bitwarden-uuid/Cargo.toml index 37013e93a..69fbc20a5 100644 --- a/crates/bitwarden-uuid/Cargo.toml +++ b/crates/bitwarden-uuid/Cargo.toml @@ -17,12 +17,12 @@ keywords.workspace = true [dependencies] bitwarden-uuid-macro = { workspace = true } -[lints] -workspace = true - 
[dev-dependencies] serde = { workspace = true } serde-wasm-bindgen = { workspace = true } serde_json = { workspace = true } uuid = { workspace = true } wasm-bindgen-test = { workspace = true } + +[lints] +workspace = true From 8eee7d8f5439fee40066db46ae006c6dee7583e0 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Thu, 28 Aug 2025 18:19:03 -0700 Subject: [PATCH 04/10] WIP: refactored to match logic in web vault & added preliminary tests --- crates/bitwarden-vault/src/cipher/login.rs | 7 - .../src/duplicate_detection.rs | 655 +++++++++++++++++- 2 files changed, 622 insertions(+), 40 deletions(-) diff --git a/crates/bitwarden-vault/src/cipher/login.rs b/crates/bitwarden-vault/src/cipher/login.rs index d37f5a967..5e6d952ea 100644 --- a/crates/bitwarden-vault/src/cipher/login.rs +++ b/crates/bitwarden-vault/src/cipher/login.rs @@ -34,13 +34,6 @@ pub enum UriMatchType { Never = 5, } -pub enum DuplicateUriMatchType { - Domain, - Hostname, - Host, - Exact, -} - #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase", deny_unknown_fields)] #[cfg_attr(feature = "uniffi", derive(uniffi::Record))] diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index 72ee26dd0..40be4135d 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -1,15 +1,70 @@ -use std::{collections::HashMap, str::FromStr}; +use std::collections::{hash_map::Entry, HashMap, HashSet}; -use bitwarden_crypto::EncString; use url::Url; -use crate::{cipher::login::DuplicateUriMatchType, Cipher}; +use crate::Cipher; -fn normalize_name_for_matching(s: &str) -> String { - // currently only removes internal and external whitespace - s.chars().filter(|c| !c.is_whitespace()).collect() +/// A grouping of ciphers considered duplicates under a specific display key. +#[derive(Debug)] +pub struct DuplicateSet<'a> { + /// Human-readable grouping key (e.g. "username+uri: alice @ example.com"). + pub key: String, + /// All ciphers participating in this duplicate group. + pub ciphers: Vec<&'a Cipher>, } +/// Strategy for determining whether two login URIs should be considered the same +/// when detecting duplicate ciphers. +/// +/// The strategies progressively narrow what is considered a match: +/// * Domain: compares only the registrable domain (e.g. `sub.example.co.uk` -> `example.co.uk`). +/// * Hostname: compares the full hostname without port (e.g. `sub.example.com`). +/// * Host: compares hostname plus a port (if present) +/// * Exact: compares the full original URI string verbatim. +pub enum DuplicateUriMatchType { + /// Match by the effective registrable domain portion of the host. + Domain, + /// Match by the full hostname (subdomains preserved), excluding any port. + Hostname, + /// Match by hostname plus a port (if present). + Host, + /// Match by the exact original URI string with no normalization applied. + Exact, +} + +/// Normalize a cipher name for duplicate comparison: +/// +/// 1. Removes *all* Unicode whitespace characters (spaces, tabs, newlines, etc.) +/// 2. Converts the remaining characters to lowercase +/// +/// Returns a newly allocated `String` containing the normalized form. 
+fn normalize_name_for_matching(name: &str) -> String { + // currently only removes internal and external whitespace and lowercases + let normalized = name + .chars() + .filter(|c| !c.is_whitespace()) + .collect::() + .to_lowercase(); + normalized +} + +/// Normalize a URI according to the chosen duplicate matching strategy. +/// +/// Returns an `Option` where: +/// * Domain uses the public suffix list (psl) to collapse `a.b.example.co.uk` -> `example.co.uk`. +/// * Hostname preserves the full hostname exactly as parsed (no port). +/// * Host appends port (if present) to the hostname. +/// * Exact performs no normalization +/// +/// Examples: +/// ```text +/// Strategy=Domain: https://app.eu.example.com/login => Some("example.com") +/// Strategy=Hostname: https://app.eu.example.com/login => Some("app.eu.example.com") +/// Strategy=Host: https://app.eu.example.com/login => Some("app.eu.example.com:443") +/// Strategy=Host: http://example.com:80 => Some("example.com:80") +/// Strategy=Exact: not a uri => Some("not a uri") +/// Strategy=Domain: not a uri => None (parse fails) +/// ``` fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Option { match strategy { DuplicateUriMatchType::Domain => Url::parse(uri).ok().and_then(|url| { @@ -21,49 +76,583 @@ fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Op .and_then(|url| url.host_str().map(|hostname| hostname.to_string())), DuplicateUriMatchType::Host => Url::parse(uri).ok().map(|url| { let hostname = url.host_str().map(|hostname| hostname.to_string()); - let default_port = if url.scheme() == "http" { 80 } else { 443 }; format!( "{}:{}", hostname.unwrap_or_default(), - url.port().unwrap_or(default_port) + url.port().unwrap_or_default() ) }), DuplicateUriMatchType::Exact => Some(uri.to_string()), } } -fn find_duplicates<'a>(ciphers: &'a [Cipher], strategy: DuplicateUriMatchType) -> Vec<&'a Cipher> { - let mut counts: HashMap> = HashMap::new(); +/// Build groups of duplicated ciphers +/// +/// Buckets (size >= 2 kept): +/// * username+uri: username and each normalized URI (strategy-specific) +/// * username+name: username and normalized name +/// * name-only: normalized name when the username is missing +/// +/// Normalization: +/// * URIs: [normalize_uri_for_matching] and the provided strategy +/// * Names: [normalize_name_for_matching] (whitespace removed, lowercase) +/// +/// When different buckets contain the exact same cipher membership, only the +/// highest‑precedence bucket is retained (username+uri > username+name > name-only). +/// +/// Display key formats: +/// * username+uri: @ +/// * username+name: & +/// * username+name: & (blank username for name-only) +/// +/// A cipher can appear in multiple returned sets if it legitimately matches +/// distinct groups (e.g., the same username across two distinct URIs). 
+pub fn find_duplicate_sets<'a>( + ciphers: &'a [Cipher], + strategy: DuplicateUriMatchType, +) -> Vec> { + #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] + enum BucketKind { + UsernameUri, + UsernameName, + NameOnly, + } + + impl BucketKind { + fn precedence(self) -> u8 { + match self { + BucketKind::UsernameUri => 3, + BucketKind::UsernameName => 2, + BucketKind::NameOnly => 1, + } + } + } + + // (BucketKind, grouping_key) -> Vec<&Cipher> + let mut buckets: HashMap<(BucketKind, String), Vec<&'a Cipher>> = HashMap::new(); for cipher in ciphers.iter() { - if let Some(login) = cipher.login.as_ref() { - // for login ciphers, match on username and uri - if let Some(uris) = login.uris.as_ref() { - let empty_string = - EncString::from_str("").expect("Failed to create empty EncString"); - let username = login.username.as_ref().unwrap_or(&empty_string).to_string(); - for login_uri_view in uris.iter() { - if let Some(uri) = login_uri_view.uri.as_ref() { - if let Some(normalized_uri) = - normalize_uri_for_matching(&uri.to_string(), &strategy) - { - let key = format!("USERNAME+URI:{}:{}", username, normalized_uri); - counts.entry(key).or_default().push(cipher); - } + // Extract username (if login) and list of URIs + let (username, uri_strings): (String, Vec) = if let Some(login) = &cipher.login { + let username = login + .username + .as_ref() + .map(|u| u.to_string()) + .unwrap_or_default() + .trim() + .to_string(); + let uris = login + .uris + .as_ref() + .map(|all_uris| { + all_uris + .iter() + .filter_map(|curr_uri| curr_uri.uri.as_ref().map(|e| e.to_string())) + .collect::>() + }) + .unwrap_or_default(); + (username, uris) + } else { + (String::new(), Vec::new()) + }; + let has_username = !username.is_empty(); + + // Username + URI buckets (dedupe normalized URIs per cipher) + if has_username && !uri_strings.is_empty() { + let mut per_cipher_seen: HashSet = HashSet::new(); + for raw_uri in uri_strings.iter() { + if let Some(norm_uri) = normalize_uri_for_matching(raw_uri, &strategy) { + if per_cipher_seen.insert(norm_uri.clone()) { + buckets + .entry((BucketKind::UsernameUri, format!("{username}||{norm_uri}"))) + .or_default() + .push(cipher); } } } - } else { - // if not login, match on cipher name - let name = normalize_name_for_matching(&cipher.name.to_string()); - let key = format!("NAME:{}", name); - counts.entry(key).or_default().push(cipher); + } + + // Name-based buckets + let raw_name = cipher.name.to_string(); + let trimmed_name = raw_name.trim(); + if !trimmed_name.is_empty() { + let norm_name = normalize_name_for_matching(trimmed_name); + if !norm_name.is_empty() { + // guard in case normalization strips everything + if has_username { + buckets + .entry((BucketKind::UsernameName, format!("{username}||{norm_name}"))) + .or_default() + .push(cipher); + } else { + buckets + .entry((BucketKind::NameOnly, norm_name)) + .or_default() + .push(cipher); + } + } + } + } + + // Helper to produce a stable, order-independent membership signature. + // Prefer stable cipher IDs; fall back to pointer addresses (prefixed) only if an ID is absent. + fn signature(ciphers: &[&Cipher]) -> String { + let mut ids: Vec = ciphers + .iter() + .map(|c| { + if let Some(id) = c.id.as_ref() { + id.to_string() + } else { + // Defensive fallback uses a pointer to cipher memory address (avoids unwrap / + // expect) Prefix with _ptr to avoid accidental collision + // with a real UUID string. 
+ format!("_ptr{:016x}", *c as *const Cipher as usize) + } + }) + .collect(); + ids.sort_unstable(); + ids.join("|") + } + + // signature -> (precedence, BucketKind, grouping_key, members) + let mut strongest_matches: HashMap)> = + HashMap::new(); + + for ((kind, key), members) in buckets.into_iter() { + if members.len() < 2 { + continue; + } + let signature = signature(&members); + let precedence = kind.precedence(); + match strongest_matches.entry(signature) { + Entry::Vacant(vacant) => { + vacant.insert((precedence, kind, key, members)); + } + Entry::Occupied(mut occupied) => { + if precedence > occupied.get().0 { + occupied.insert((precedence, kind, key, members)); + } + } } } - counts + // Convert to DuplicateSet with Web Vault display key formatting + let mut sets: Vec = strongest_matches .into_values() - .filter(|ciphers| ciphers.len() >= 2) - .flatten() - .collect() + .map(|(_p, kind, key, members)| { + let display = match kind { + BucketKind::UsernameUri => { + if let Some((user, uri_part)) = key.split_once("||") { + format!("username+uri: {user} @ {uri_part}") + } else { + format!("username+uri: {key}") + } + } + BucketKind::UsernameName => { + if let Some((user, _canon)) = key.split_once("||") { + let display_name = members + .first() + .map(|c| c.name.to_string()) + .unwrap_or_default(); + let trimmed = display_name.trim(); + format!("username+name: {user} & {trimmed}") + } else { + let display_name = members + .first() + .map(|c| c.name.to_string()) + .unwrap_or_default(); + let trimmed = display_name.trim(); + format!("username+name: & {trimmed}") + } + } + BucketKind::NameOnly => { + let display_name = members + .first() + .map(|c| c.name.to_string()) + .unwrap_or_default(); + let trimmed = display_name.trim(); + format!("username+name: & {trimmed}") + } + }; + DuplicateSet { + key: display, + ciphers: members, + } + }) + .collect(); + + sets.sort_by(|a, b| a.key.cmp(&b.key)); + sets +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use bitwarden_crypto::EncString; + use chrono::Utc; + + use super::*; + use crate::{ + cipher::{ + cipher::{CipherRepromptType, CipherType}, + login::{Login, LoginUri}, + }, + Cipher, + }; // needed for EncString::from_str + + // Helper to build a login cipher + fn make_login_cipher( + id: Option<&str>, + username: Option<&str>, + uris: &[&str], + name: &str, + ) -> Cipher { + Cipher { + id: id.map(|s| s.parse().unwrap()), + organization_id: None, + folder_id: None, + collection_ids: vec![], + key: None, + name: EncString::from_str(name).unwrap(), + notes: None, + r#type: CipherType::Login, + login: Some(Login { + username: username.map(|u| EncString::from_str(u).unwrap()), + password: None, + password_revision_date: None, + uris: if uris.is_empty() { + None + } else { + Some( + uris.iter() + .map(|u| LoginUri { + uri: Some(EncString::from_str(u).unwrap()), + r#match: None, + uri_checksum: None, + }) + .collect(), + ) + }, + totp: None, + autofill_on_page_load: None, + fido2_credentials: None, + }), + identity: None, + card: None, + secure_note: None, + ssh_key: None, + favorite: false, + reprompt: CipherRepromptType::None, + organization_use_totp: false, + edit: false, + permissions: None, + view_password: false, + local_data: None, + attachments: None, + fields: None, + password_history: None, + creation_date: Utc::now(), + deleted_date: None, + revision_date: Utc::now(), + } + } + + // Helper to build non-login cipher (SecureNote) + fn make_note_cipher(id: Option<&str>, name: &str) -> Cipher { + Cipher { + id: id.map(|s| 
s.parse().unwrap()), + organization_id: None, + folder_id: None, + collection_ids: vec![], + key: None, + name: EncString::from_str(name).unwrap(), + notes: None, + r#type: CipherType::SecureNote, + login: None, + identity: None, + card: None, + secure_note: None, + ssh_key: None, + favorite: false, + reprompt: CipherRepromptType::None, + organization_use_totp: false, + edit: false, + permissions: None, + view_password: false, + local_data: None, + attachments: None, + fields: None, + password_history: None, + creation_date: Utc::now(), + deleted_date: None, + revision_date: Utc::now(), + } + } + + // ---- normalize_name_for_matching tests ---- + #[test] + fn test_normalize_name_empty_and_whitespace() { + assert_eq!(normalize_name_for_matching(""), ""); + assert_eq!(normalize_name_for_matching(" \t \n"), ""); + } + + #[test] + fn test_normalize_name_internal_whitespace_and_case() { + assert_eq!(normalize_name_for_matching("My Site"), "mysite"); + assert_eq!( + normalize_name_for_matching(" Mixed Case Name "), + "mixedcasename" + ); + assert_eq!(normalize_name_for_matching("T A B S"), "tabs"); + assert_eq!( + normalize_name_for_matching("Multi\nLine\tName"), + "multilinename" + ); + } + + #[test] + fn test_normalize_name_unicode_whitespace() { + // Includes non-breaking space (\u{00A0}) and thin space (\u{2009}) + let s = "Name\u{00A0}\u{2009}With\u{00A0}Spaces"; + assert_eq!(normalize_name_for_matching(s), "namewithspaces"); + } + + // ---- normalize_uri_for_matching tests ---- + + #[test] + fn test_normalize_uri_domain_basic() { + let uri = "https://sub.example.co.uk/path"; + // Using PSL should reduce to example.co.uk + let norm = normalize_uri_for_matching(uri, &DuplicateUriMatchType::Domain); + assert_eq!(norm, Some("example.co.uk".to_string())); + } + + #[test] + fn test_normalize_uri_domain_ip_returns_none() { + let uri = "https://192.168.1.10/login"; + // IP address has no registrable domain + let norm = normalize_uri_for_matching(uri, &DuplicateUriMatchType::Domain); + assert_eq!(norm, None); + } + + #[test] + fn test_normalize_uri_hostname_and_host() { + let uri = "https://app.example.com:8443/a"; + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Hostname), + Some("app.example.com".to_string()) + ); + // Host strategy uses explicit port only, defaulting to 0 if absent (per current + // implementation) + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("app.example.com:8443".to_string()) + ); + } + + #[test] + fn test_normalize_uri_host_no_explicit_port_results_zero() { + let uri = "https://example.com/path"; // no explicit port -> :0 by implementation + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("example.com:0".to_string()) + ); + } + + #[test] + fn test_normalize_uri_exact_invalid_still_returns() { + let raw = "not a uri"; + assert_eq!( + normalize_uri_for_matching(raw, &DuplicateUriMatchType::Exact), + Some(raw.to_string()) + ); + assert_eq!( + normalize_uri_for_matching(raw, &DuplicateUriMatchType::Domain), + None + ); + } + + // ---- find_duplicate_sets tests ---- + + #[test] + fn test_find_duplicate_sets_empty_input() { + let sets = find_duplicate_sets(&[], DuplicateUriMatchType::Domain); + assert!(sets.is_empty()); + } + + #[test] + fn test_username_uri_duplicate_basic() { + let c1 = make_login_cipher( + Some("11111111-1111-1111-1111-111111111111"), + Some("alice"), + &["https://a.example.com"], + "Site A", + ); + let c2 = make_login_cipher( + 
Some("22222222-2222-2222-2222-222222222222"), + Some("alice"), + &["https://a.example.com"], + "Site B", + ); + let ciphers = vec![c1, c2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets.len(), 1); // domain example.com + assert!(sets[0].key.starts_with("username+uri:")); + assert_eq!(sets[0].ciphers.len(), 2); + } + + #[test] + fn test_username_name_precedence_lower_than_uri() { + let c1 = make_login_cipher( + Some("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"), + Some("bob"), + &["https://foo.com"], + "Foo", + ); + let c2 = make_login_cipher( + Some("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"), + Some("bob"), + &["https://foo.com"], + "Foo", + ); + let ciphers = vec![c1, c2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets.len(), 1); + assert!(sets[0].key.starts_with("username+uri:")); + } + + #[test] + fn test_name_only_duplicates() { + let n1 = make_note_cipher(Some("33333333-3333-3333-3333-333333333333"), "Shared Note"); + let n2 = make_note_cipher( + Some("44444444-4444-4444-4444-444444444444"), + " shared note ", + ); + let ciphers = vec![n1, n2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets.len(), 1); + assert!(sets[0].key.contains("Shared Note") || sets[0].key.contains("shared note")); + } + + #[test] + fn test_cipher_in_multiple_sets_via_distinct_uris() { + let a = make_login_cipher( + Some("55555555-5555-5555-5555-555555555555"), + Some("alice"), + &["https://a.com", "https://b.com"], + "Multi", + ); + let b = make_login_cipher( + Some("66666666-6666-6666-6666-666666666666"), + Some("alice"), + &["https://a.com"], + "One", + ); + let c = make_login_cipher( + Some("77777777-7777-7777-7777-777777777777"), + Some("alice"), + &["https://b.com"], + "Two", + ); + let ciphers = vec![a, b, c]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets.len(), 2); + let multi_appearances = sets + .iter() + .filter(|s| s.ciphers.iter().any(|c| c.name.to_string() == "Multi")); + assert_eq!(multi_appearances.count(), 2); + } + + #[test] + fn test_identical_membership_multiple_normalized_uris_collapse() { + let c1 = make_login_cipher( + Some("88888888-8888-8888-8888-888888888888"), + Some("user"), + &["https://x.example.com", "https://y.example.com"], + "X", + ); + let c2 = make_login_cipher( + Some("99999999-9999-9999-9999-999999999999"), + Some("user"), + &["https://x.example.com", "https://y.example.com"], + "Y", + ); + let ciphers = vec![c1, c2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Hostname); + assert_eq!(sets.len(), 1); + assert!( + sets[0].key.contains("user @ x.example.com") + || sets[0].key.contains("user @ y.example.com") + ); + } + + #[test] + fn test_per_cipher_uri_deduplication() { + let c1 = make_login_cipher( + Some("10101010-1010-1010-1010-101010101010"), + Some("u"), + &["https://dup.com", "https://dup.com"], + "A", + ); + let c2 = make_login_cipher( + Some("11111110-1111-1111-1111-111111101111"), + Some("u"), + &["https://dup.com", "https://dup.com"], + "B", + ); + let ciphers = vec![c1, c2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Hostname); + assert_eq!(sets.len(), 1); + assert_eq!(sets[0].ciphers.len(), 2); + } + + #[test] + fn test_domain_strategy_groups_subdomains() { + let c1 = make_login_cipher( + Some("12121212-1212-1212-1212-121212121212"), + Some("user"), + &["https://a.service.example.com"], + "A", + ); + let c2 = make_login_cipher( + 
Some("13131313-1313-1313-1313-131313131313"), + Some("user"), + &["https://b.service.example.com"], + "B", + ); + let ciphers_domain = vec![c1.clone(), c2.clone()]; + let sets_domain = find_duplicate_sets(&ciphers_domain, DuplicateUriMatchType::Domain); + assert_eq!(sets_domain.len(), 1); + let ciphers_hostname = vec![c1, c2]; + let sets_hostname = find_duplicate_sets(&ciphers_hostname, DuplicateUriMatchType::Hostname); + assert!(sets_hostname.is_empty()); + } + + #[test] + fn test_missing_ids_still_group_using_pointer_fallback() { + let c1 = make_note_cipher(None, "SameName"); + let c2 = make_note_cipher(None, " same name "); + let ciphers = vec![c1, c2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets.len(), 1); + assert!(sets[0].key.contains("SameName") || sets[0].key.contains("same name")); + } + + #[test] + fn test_username_name_does_not_cross_with_name_only() { + let login = make_login_cipher( + Some("14141414-1414-1414-1414-141414141414"), + Some("user"), + &[], + "Duplicate", + ); + let note1 = make_note_cipher(Some("15151515-1515-1515-1515-151515151515"), "Duplicate"); + let note2 = make_note_cipher(Some("16161616-1616-1616-1616-161616161616"), "duplicate"); + let ciphers = vec![login, note1, note2]; + let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets.len(), 1); + assert!(sets[0].key.starts_with("username+name: &")); + assert_eq!(sets[0].ciphers.len(), 2); + } } From 6dd09702509bbdb7e34848012ab9dd0f101f0b4a Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Fri, 29 Aug 2025 07:59:23 -0700 Subject: [PATCH 05/10] WIP: minor refactors for readability and safety, tests failing due to external files --- .../src/duplicate_detection.rs | 76 +++++++++---------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index 40be4135d..556bc766d 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -4,22 +4,22 @@ use url::Url; use crate::Cipher; -/// A grouping of ciphers considered duplicates under a specific display key. +/// A grouping of ciphers considered duplicates under a specific display key +/// that can be derived from login.username, login.uris, and cipher.name #[derive(Debug)] pub struct DuplicateSet<'a> { - /// Human-readable grouping key (e.g. "username+uri: alice @ example.com"). + /// Human-readable key (e.g. "username+uri: alice @ example.com"). pub key: String, - /// All ciphers participating in this duplicate group. + /// All ciphers participating in the duplicate group. pub ciphers: Vec<&'a Cipher>, } -/// Strategy for determining whether two login URIs should be considered the same -/// when detecting duplicate ciphers. +/// Strategy for determining whether two login URIs should be considered duplicates /// /// The strategies progressively narrow what is considered a match: /// * Domain: compares only the registrable domain (e.g. `sub.example.co.uk` -> `example.co.uk`). -/// * Hostname: compares the full hostname without port (e.g. `sub.example.com`). -/// * Host: compares hostname plus a port (if present) +/// * Hostname: compares the full hostname without port (e.g. `sub.example.co.uk`). +/// * Host: compares hostname and specified port (when present) /// * Exact: compares the full original URI string verbatim. 
pub enum DuplicateUriMatchType { /// Match by the effective registrable domain portion of the host. @@ -39,7 +39,6 @@ pub enum DuplicateUriMatchType { /// /// Returns a newly allocated `String` containing the normalized form. fn normalize_name_for_matching(name: &str) -> String { - // currently only removes internal and external whitespace and lowercases let normalized = name .chars() .filter(|c| !c.is_whitespace()) @@ -67,25 +66,26 @@ fn normalize_name_for_matching(name: &str) -> String { /// ``` fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Option { match strategy { - DuplicateUriMatchType::Domain => Url::parse(uri).ok().and_then(|url| { - url.host_str() - .and_then(|hostname| psl::domain_str(hostname).map(|domain| domain.to_string())) - }), - DuplicateUriMatchType::Hostname => Url::parse(uri) - .ok() - .and_then(|url| url.host_str().map(|hostname| hostname.to_string())), - DuplicateUriMatchType::Host => Url::parse(uri).ok().map(|url| { - let hostname = url.host_str().map(|hostname| hostname.to_string()); - format!( - "{}:{}", - hostname.unwrap_or_default(), - url.port().unwrap_or_default() - ) - }), + DuplicateUriMatchType::Domain => { + let url = Url::parse(uri).ok()?; + let host = url.host_str()?; + let host_lower = host.to_ascii_lowercase(); + psl::domain_str(&host_lower).map(str::to_string) + } + DuplicateUriMatchType::Hostname => { + let url = Url::parse(uri).ok()?; + let host = url.host_str()?; + Some(host.to_string()) + } + DuplicateUriMatchType::Host => { + let url = Url::parse(uri).ok()?; + let host = url.host_str()?; + let port = url.port().unwrap_or_default(); + Some(format!("{host}:{port}")) + } DuplicateUriMatchType::Exact => Some(uri.to_string()), } } - /// Build groups of duplicated ciphers /// /// Buckets (size >= 2 kept): @@ -128,10 +128,14 @@ pub fn find_duplicate_sets<'a>( } } - // (BucketKind, grouping_key) -> Vec<&Cipher> let mut buckets: HashMap<(BucketKind, String), Vec<&'a Cipher>> = HashMap::new(); for cipher in ciphers.iter() { + // Skip ciphers without a stable ID; they cannot participate in duplicate grouping. + if cipher.id.is_none() { + continue; + } + // Extract username (if login) and list of URIs let (username, uri_strings): (String, Vec) = if let Some(login) = &cipher.login { let username = login @@ -157,7 +161,7 @@ pub fn find_duplicate_sets<'a>( }; let has_username = !username.is_empty(); - // Username + URI buckets (dedupe normalized URIs per cipher) + // Username + URI buckets (avoid redundant URIs for each cipher) if has_username && !uri_strings.is_empty() { let mut per_cipher_seen: HashSet = HashSet::new(); for raw_uri in uri_strings.iter() { @@ -195,26 +199,20 @@ pub fn find_duplicate_sets<'a>( } // Helper to produce a stable, order-independent membership signature. - // Prefer stable cipher IDs; fall back to pointer addresses (prefixed) only if an ID is absent. fn signature(ciphers: &[&Cipher]) -> String { let mut ids: Vec = ciphers .iter() .map(|c| { - if let Some(id) = c.id.as_ref() { - id.to_string() - } else { - // Defensive fallback uses a pointer to cipher memory address (avoids unwrap / - // expect) Prefix with _ptr to avoid accidental collision - // with a real UUID string. 
- format!("_ptr{:016x}", *c as *const Cipher as usize) - } + c.id.as_ref().map(|id| id.to_string()).expect( + "cipher lacking id did not trigger 'continue' and was present in iteration", + ) }) .collect(); ids.sort_unstable(); ids.join("|") } - // signature -> (precedence, BucketKind, grouping_key, members) + // key: signature -> value: (precedence, BucketKind, grouping_key, members) let mut strongest_matches: HashMap)> = HashMap::new(); @@ -630,13 +628,13 @@ mod tests { } #[test] - fn test_missing_ids_still_group_using_pointer_fallback() { + fn test_ciphers_without_ids_are_ignored() { + // Both ciphers lack IDs so they cannot participate; no duplicate set produced. let c1 = make_note_cipher(None, "SameName"); let c2 = make_note_cipher(None, " same name "); let ciphers = vec![c1, c2]; let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert_eq!(sets.len(), 1); - assert!(sets[0].key.contains("SameName") || sets[0].key.contains("same name")); + assert!(sets.is_empty()); } #[test] From 0f16b9588c741909f2e47f980382f2b6390fd199 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:05:54 -0700 Subject: [PATCH 06/10] Added duplicate detection logic (mostly) matching web vault with test coverage --- .../src/duplicate_detection.rs | 289 +++++++++++------- 1 file changed, 185 insertions(+), 104 deletions(-) diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index 556bc766d..f3c89c37d 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -1,8 +1,28 @@ +//! # Duplicate cipher detection +//! +//! Duplicate detection utilities for grouping related [`CipherView`] values that +//! appear to represent the same real-world login. Buckets are built along three +//! axes (username+URI, username+name, and name-only) using normalization rules +//! so that visually similar entries (e.g. differing only by whitespace or +//! subdomain) coalesce. When multiple bucket types cover the exact same set of +//! ciphers, only the highest-precedence bucket (username+uri > username+name > +//! name-only) is kept to avoid redundant duplicate reports. A cipher can still +//! appear in multiple returned groups if the underlying membership differs +//! (e.g. same username across two different domains). Normalization is biased +//! toward collapsing obviously equivalent values without losing distinct data. +//! +//! ## High-level flow +//! 1. Iterate ciphers, skipping those without an id (cannot be deleted after review). +//! 2. Derive normalized `login.username`, `login.uris` (strategy dependent) and `cipher.name`. +//! 3. Insert into typed buckets keyed by a canonical composite key. +//! 4. Filter out singleton buckets; collapse identical membership by precedence. +//! 5. Produce stable, human-readable display keys for UI consumption via [`find_duplicate_sets`]. + use std::collections::{hash_map::Entry, HashMap, HashSet}; use url::Url; -use crate::Cipher; +use crate::cipher::cipher::CipherView; /// A grouping of ciphers considered duplicates under a specific display key /// that can be derived from login.username, login.uris, and cipher.name @@ -11,7 +31,7 @@ pub struct DuplicateSet<'a> { /// Human-readable key (e.g. "username+uri: alice @ example.com"). pub key: String, /// All ciphers participating in the duplicate group. 
- pub ciphers: Vec<&'a Cipher>, + pub ciphers: Vec<&'a CipherView>, } /// Strategy for determining whether two login URIs should be considered duplicates @@ -19,14 +39,14 @@ pub struct DuplicateSet<'a> { /// The strategies progressively narrow what is considered a match: /// * Domain: compares only the registrable domain (e.g. `sub.example.co.uk` -> `example.co.uk`). /// * Hostname: compares the full hostname without port (e.g. `sub.example.co.uk`). -/// * Host: compares hostname and specified port (when present) +/// * Host: compares hostname and an explicitly specified port (omits default/implicit ports) /// * Exact: compares the full original URI string verbatim. pub enum DuplicateUriMatchType { /// Match by the effective registrable domain portion of the host. Domain, /// Match by the full hostname (subdomains preserved), excluding any port. Hostname, - /// Match by hostname plus a port (if present). + /// Match by hostname plus a port (omits default/implicit ports) Host, /// Match by the exact original URI string with no normalization applied. Exact, @@ -39,12 +59,16 @@ pub enum DuplicateUriMatchType { /// /// Returns a newly allocated `String` containing the normalized form. fn normalize_name_for_matching(name: &str) -> String { - let normalized = name - .chars() - .filter(|c| !c.is_whitespace()) - .collect::() - .to_lowercase(); - normalized + // Allocate once; skip whitespace and push lowercase chars. + let mut out = String::with_capacity(name.len()); + for ch in name.chars() { + if !ch.is_whitespace() { + for lower in ch.to_lowercase() { + out.push(lower); + } + } + } + out } /// Normalize a URI according to the chosen duplicate matching strategy. @@ -52,15 +76,25 @@ fn normalize_name_for_matching(name: &str) -> String { /// Returns an `Option` where: /// * Domain uses the public suffix list (psl) to collapse `a.b.example.co.uk` -> `example.co.uk`. /// * Hostname preserves the full hostname exactly as parsed (no port). -/// * Host appends port (if present) to the hostname. 
+/// * Host appends the explicit port only when one is specified in the URI (see limitations)
 /// * Exact performs no normalization
 ///
+/// Limitations when using 'Host' strategy:
+/// Unlike the URL parsing library used in the web vault
+/// [ https://nodejs.org/api/url.html#the-whatwg-url-api ]
+/// the url crate will strip explicit port numbers
+/// matching the default port for a given url scheme:
+/// * http://some.domain:80 => some.domain (http default port 80 is not retained)
+/// * https://some.domain:443 => some.domain (https default port 443 is not retained)
+/// * https://some.domain:4444 => some.domain:4444
+///
 /// Examples:
 /// ```text
 /// Strategy=Domain: https://app.eu.example.com/login => Some("example.com")
 /// Strategy=Hostname: https://app.eu.example.com/login => Some("app.eu.example.com")
-/// Strategy=Host: https://app.eu.example.com/login => Some("app.eu.example.com:443")
-/// Strategy=Host: http://example.com:80 => Some("example.com:80")
+/// Strategy=Host: https://app.eu.example.com/login => Some("app.eu.example.com") (no port provided)
+/// Strategy=Host: https://app.eu.example.com:443/ => Some("example.com") (default port for scheme is lost)
+/// Strategy=Host: https://app.eu.example.com:4444/ => Some("example.com:4444")
 /// Strategy=Exact: not a uri => Some("not a uri")
 /// Strategy=Domain: not a uri => None (parse fails)
 /// ```
 fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Option<String> {
     match strategy {
         DuplicateUriMatchType::Domain => {
             let url = Url::parse(uri).ok()?;
             let host = url.host_str()?;
+            // Treat raw IP addresses (v4 or v6) as non-domain (no registrable domain to compare)
+            // This is another divergence from original web vault implementation
+            if host.parse::<std::net::IpAddr>().is_ok() {
+                return None;
+            }
             let host_lower = host.to_ascii_lowercase();
             psl::domain_str(&host_lower).map(str::to_string)
         }
         DuplicateUriMatchType::Hostname => {
             let url = Url::parse(uri).ok()?;
             let host = url.host_str()?;
             Some(host.to_string())
         }
         DuplicateUriMatchType::Host => {
             let url = Url::parse(uri).ok()?;
             let host = url.host_str()?;
-            let port = url.port().unwrap_or_default();
-            Some(format!("{host}:{port}"))
+            if let Some(port) = url.port() {
+                Some(format!("{host}:{port}"))
+            } else {
+                Some(host.to_string())
+            }
         }
         DuplicateUriMatchType::Exact => Some(uri.to_string()),
     }
 }
 /// Build groups of duplicated ciphers
 ///
 /// Buckets (size >= 2 kept):
 /// * username+uri: username and each normalized URI (strategy-specific)
 /// * username+name: username and normalized name
 /// * name-only: normalized name when the username is missing
 ///
 /// Normalization:
 /// * URIs: [normalize_uri_for_matching] and the provided strategy
 /// * Names: [normalize_name_for_matching] (whitespace removed, lowercase)
 ///
 /// When different buckets contain the exact same cipher membership, only the
 /// highest-precedence bucket is retained (username+uri > username+name > name-only).
 ///
 /// Display key formats:
 /// * username+uri: <username> @ <normalized uri>
 /// * username+name: <username> & <name>
-/// * username+name: & <name> (blank username for name-only)
+/// * name-only: & <name> (blank username)
 ///
 /// A cipher can appear in multiple returned sets if it legitimately matches
 /// distinct groups (e.g., the same username across two distinct URIs).
pub fn find_duplicate_sets<'a>( - ciphers: &'a [Cipher], + ciphers: &'a [CipherView], strategy: DuplicateUriMatchType, ) -> Vec> { + const KEY_SEP: &str = "||"; // separator between composite key parts #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] enum BucketKind { UsernameUri, @@ -128,10 +171,9 @@ pub fn find_duplicate_sets<'a>( } } - let mut buckets: HashMap<(BucketKind, String), Vec<&'a Cipher>> = HashMap::new(); + let mut buckets: HashMap<(BucketKind, String), Vec<&'a CipherView>> = HashMap::new(); for cipher in ciphers.iter() { - // Skip ciphers without a stable ID; they cannot participate in duplicate grouping. if cipher.id.is_none() { continue; } @@ -141,17 +183,15 @@ pub fn find_duplicate_sets<'a>( let username = login .username .as_ref() - .map(|u| u.to_string()) - .unwrap_or_default() - .trim() - .to_string(); + .map(|u| u.trim().to_string()) + .unwrap_or_default(); let uris = login .uris .as_ref() .map(|all_uris| { all_uris .iter() - .filter_map(|curr_uri| curr_uri.uri.as_ref().map(|e| e.to_string())) + .filter_map(|curr_uri| curr_uri.uri.clone()) .collect::>() }) .unwrap_or_default(); @@ -168,7 +208,10 @@ pub fn find_duplicate_sets<'a>( if let Some(norm_uri) = normalize_uri_for_matching(raw_uri, &strategy) { if per_cipher_seen.insert(norm_uri.clone()) { buckets - .entry((BucketKind::UsernameUri, format!("{username}||{norm_uri}"))) + .entry(( + BucketKind::UsernameUri, + format!("{username}{KEY_SEP}{norm_uri}"), + )) .or_default() .push(cipher); } @@ -177,7 +220,7 @@ pub fn find_duplicate_sets<'a>( } // Name-based buckets - let raw_name = cipher.name.to_string(); + let raw_name = cipher.name.clone(); let trimmed_name = raw_name.trim(); if !trimmed_name.is_empty() { let norm_name = normalize_name_for_matching(trimmed_name); @@ -185,7 +228,10 @@ pub fn find_duplicate_sets<'a>( // guard in case normalization strips everything if has_username { buckets - .entry((BucketKind::UsernameName, format!("{username}||{norm_name}"))) + .entry(( + BucketKind::UsernameName, + format!("{username}{KEY_SEP}{norm_name}"), + )) .or_default() .push(cipher); } else { @@ -199,21 +245,20 @@ pub fn find_duplicate_sets<'a>( } // Helper to produce a stable, order-independent membership signature. - fn signature(ciphers: &[&Cipher]) -> String { - let mut ids: Vec = ciphers - .iter() - .map(|c| { - c.id.as_ref().map(|id| id.to_string()).expect( - "cipher lacking id did not trigger 'continue' and was present in iteration", - ) - }) - .collect(); + // All ciphers inserted into buckets have an id (guard enforced earlier). 
+ fn signature(ciphers: &[&CipherView]) -> String { + let mut ids: Vec = Vec::with_capacity(ciphers.len()); + for c in ciphers.iter() { + if let Some(id) = &c.id { + ids.push(id.to_string()); + } + } ids.sort_unstable(); ids.join("|") } // key: signature -> value: (precedence, BucketKind, grouping_key, members) - let mut strongest_matches: HashMap)> = + let mut strongest_matches: HashMap)> = HashMap::new(); for ((kind, key), members) in buckets.into_iter() { @@ -238,39 +283,20 @@ pub fn find_duplicate_sets<'a>( let mut sets: Vec = strongest_matches .into_values() .map(|(_p, kind, key, members)| { + let first_name_trimmed = members + .first() + .map(|c| c.name.trim().to_string()) + .unwrap_or_default(); let display = match kind { - BucketKind::UsernameUri => { - if let Some((user, uri_part)) = key.split_once("||") { - format!("username+uri: {user} @ {uri_part}") - } else { - format!("username+uri: {key}") - } - } - BucketKind::UsernameName => { - if let Some((user, _canon)) = key.split_once("||") { - let display_name = members - .first() - .map(|c| c.name.to_string()) - .unwrap_or_default(); - let trimmed = display_name.trim(); - format!("username+name: {user} & {trimmed}") - } else { - let display_name = members - .first() - .map(|c| c.name.to_string()) - .unwrap_or_default(); - let trimmed = display_name.trim(); - format!("username+name: & {trimmed}") - } - } - BucketKind::NameOnly => { - let display_name = members - .first() - .map(|c| c.name.to_string()) - .unwrap_or_default(); - let trimmed = display_name.trim(); - format!("username+name: & {trimmed}") - } + BucketKind::UsernameUri => key + .split_once(KEY_SEP) + .map(|(user, uri_part)| format!("username+uri: {user} @ {uri_part}")) + .unwrap_or_else(|| format!("username+uri: {key}")), + BucketKind::UsernameName => key + .split_once(KEY_SEP) + .map(|(user, _)| format!("username+name: {user} & {first_name_trimmed}")) + .unwrap_or_else(|| format!("username+name: & {first_name_trimmed}")), + BucketKind::NameOnly => format!("username+name: & {first_name_trimmed}"), }; DuplicateSet { key: display, @@ -285,38 +311,32 @@ pub fn find_duplicate_sets<'a>( #[cfg(test)] mod tests { - use std::str::FromStr; - - use bitwarden_crypto::EncString; use chrono::Utc; use super::*; - use crate::{ - cipher::{ - cipher::{CipherRepromptType, CipherType}, - login::{Login, LoginUri}, - }, - Cipher, - }; // needed for EncString::from_str - - // Helper to build a login cipher + use crate::cipher::{ + cipher::{CipherRepromptType, CipherType, CipherView}, + login::{LoginUriView, LoginView}, + }; + + // helper to construct a CipherView for testing fn make_login_cipher( id: Option<&str>, username: Option<&str>, uris: &[&str], name: &str, - ) -> Cipher { - Cipher { - id: id.map(|s| s.parse().unwrap()), + ) -> CipherView { + CipherView { + id: id.map(|s| s.parse().expect("valid UUID literal")), organization_id: None, folder_id: None, collection_ids: vec![], key: None, - name: EncString::from_str(name).unwrap(), + name: name.to_string(), notes: None, r#type: CipherType::Login, - login: Some(Login { - username: username.map(|u| EncString::from_str(u).unwrap()), + login: Some(LoginView { + username: username.map(|u| u.to_string()), password: None, password_revision_date: None, uris: if uris.is_empty() { @@ -324,8 +344,8 @@ mod tests { } else { Some( uris.iter() - .map(|u| LoginUri { - uri: Some(EncString::from_str(u).unwrap()), + .map(|u| LoginUriView { + uri: Some(u.to_string()), r#match: None, uri_checksum: None, }) @@ -356,15 +376,15 @@ mod tests { } } - // Helper to 
build non-login cipher (SecureNote) - fn make_note_cipher(id: Option<&str>, name: &str) -> Cipher { - Cipher { - id: id.map(|s| s.parse().unwrap()), + // Helper to build non-login cipher view (SecureNote) + fn make_note_cipher(id: Option<&str>, name: &str) -> CipherView { + CipherView { + id: id.map(|s| s.parse().expect("valid UUID literal")), organization_id: None, folder_id: None, collection_ids: vec![], key: None, - name: EncString::from_str(name).unwrap(), + name: name.to_string(), notes: None, r#type: CipherType::SecureNote, login: None, @@ -441,8 +461,6 @@ mod tests { normalize_uri_for_matching(uri, &DuplicateUriMatchType::Hostname), Some("app.example.com".to_string()) ); - // Host strategy uses explicit port only, defaulting to 0 if absent (per current - // implementation) assert_eq!( normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), Some("app.example.com:8443".to_string()) @@ -450,11 +468,72 @@ mod tests { } #[test] - fn test_normalize_uri_host_no_explicit_port_results_zero() { - let uri = "https://example.com/path"; // no explicit port -> :0 by implementation + fn test_normalize_uri_host_no_explicit_port() { + let uri = "https://example.com/path"; // no explicit port -> hostname only + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("example.com".to_string()) + ); + } + + /* + * Due to limitations in the url crate, port cannot be obtained when it matches + * the default ports for provided schemes. This is a divergence from the URL + * parsing library used in the web vault: https://nodejs.org/api/url.html#the-whatwg-url-api + * but should be considered an edge case (most users would not supply shceme & defauult port + * and default port shouldn't be included by browser for http & https schemes) + #[test] + fn test_normalize_uri_host_explicit_default_port() { + let uri = "https://example.com:443/path"; // explicit default port retained + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("example.com:443".to_string()) + ); + } + + #[test] + fn test_normalize_uri_host_explicit_http_default_port() { + let uri = "http://example.com:80/path"; // explicit default http port retained + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("example.com:80".to_string()) + ); + } + + #[test] + fn test_normalize_uri_host_ipv6_explicit_default_port() { + let uri = "https://[2001:db8::1]:443/"; // explicit default https port on IPv6 + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("[2001:db8::1]:443".to_string()) + ); + } + + #[test] + fn test_normalize_uri_host_userinfo_no_port() { + let uri = "https://user:pass@example.com/path"; // userinfo present, no explicit port + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("example.com".to_string()) + ); + } + */ + + #[test] + fn test_normalize_uri_host_ipv6_non_default_port() { + let uri = "https://[2001:db8::1]:8443/"; // IPv6 with non-default port assert_eq!( normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), - Some("example.com:0".to_string()) + Some("[2001:db8::1]:8443".to_string()) + ); + } + + #[test] + fn test_normalize_uri_host_userinfo_explicit_non_default_port() { + let uri = "https://user:pass@example.com:8443/path"; // userinfo with explicit non-default port + assert_eq!( + normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + Some("example.com:8443".to_string()) ); } @@ -530,7 +609,7 @@ mod tests { let ciphers = vec![n1, n2]; let 
sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); assert_eq!(sets.len(), 1); - assert!(sets[0].key.contains("Shared Note") || sets[0].key.contains("shared note")); + assert!(sets[0].key.starts_with("username+name:")); } #[test] @@ -556,9 +635,11 @@ mod tests { let ciphers = vec![a, b, c]; let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); assert_eq!(sets.len(), 2); - let multi_appearances = sets - .iter() - .filter(|s| s.ciphers.iter().any(|c| c.name.to_string() == "Multi")); + let multi_appearances = sets.iter().filter(|s| { + s.ciphers.iter().any(|c| { + c.id.map(|id| id.to_string()) == Some("55555555-5555-5555-5555-555555555555".into()) + }) + }); assert_eq!(multi_appearances.count(), 2); } From 85514e8defb706c1cd0d25d9251faa2e51cb5482 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Fri, 29 Aug 2025 18:32:19 -0700 Subject: [PATCH 07/10] spell check before submitting PR --- .../src/duplicate_detection.rs | 41 +++++++++++-------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index f3c89c37d..dd72a6a29 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -3,16 +3,26 @@ //! Duplicate detection utilities for grouping related [`CipherView`] values that //! appear to represent the same real-world login. Buckets are built along three //! axes (username+URI, username+name, and name-only) using normalization rules -//! so that visually similar entries (e.g. differing only by whitespace or +//! so that visually similar entries (e.g., differing only by whitespace or //! subdomain) coalesce. When multiple bucket types cover the exact same set of //! ciphers, only the highest-precedence bucket (username+uri > username+name > //! name-only) is kept to avoid redundant duplicate reports. A cipher can still //! appear in multiple returned groups if the underlying membership differs -//! (e.g. same username across two different domains). Normalization is biased +//! (e.g., the same username across two different domains). Normalization is biased //! toward collapsing obviously equivalent values without losing distinct data. //! -//! ## High-level flow -//! 1. Iterate ciphers, skipping those without an id (cannot be deleted after review). +//! The logic in this module was originally implemented in the web app +//! [ https://github.com/bitwarden/clients/pull/15967 ] +//! and adapted to the Rust API. Some notable divergences: +//! * The Domain [ DuplicateUriMatchType ] does not support IPv4 and IPv6 addresses +//! in this module but does so in the Web app +//! * The Host [ DuplicateUriMatchType ] does not support port numbers when they match the default +//! for a given scheme (80 for http, 443 for https). Only port numbers not matching a scheme's +//! default will be parsed and retained. This is due to limitations in the url crate, and could +//! likely be fixed using regular expressions. +//! +//! ## Logical flow: +//! 1. Iterate over CipherViews, skipping those without an id (cannot be deleted after review). //! 2. Derive normalized `login.username`, `login.uris` (strategy dependent) and `cipher.name`. //! 3. Insert into typed buckets keyed by a canonical composite key. //! 4. Filter out singleton buckets; collapse identical membership by precedence. 
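To make the logical flow above concrete, here is a hedged usage sketch of the entry point introduced by this series. It assumes the caller already holds decrypted `CipherView` values, that `find_duplicate_sets`, `DuplicateUriMatchType`, and `CipherView` are reachable from the crate root (the exact re-export path is not confirmed by the diff), and that `DuplicateSet::key` is public like `ciphers`:

```rust
use bitwarden_vault::{find_duplicate_sets, CipherView, DuplicateUriMatchType};

/// Print one line per duplicate set, followed by the ids of its members.
fn report_duplicates(ciphers: &[CipherView]) {
    // Domain is the loosest URI strategy: sub.example.com and example.com collapse together.
    let sets = find_duplicate_sets(ciphers, DuplicateUriMatchType::Domain);
    for set in &sets {
        // `key` is the human-readable display key, e.g. "username+uri: alice @ example.com".
        println!("{} ({} ciphers)", set.key, set.ciphers.len());
        for cipher in &set.ciphers {
            if let Some(id) = &cipher.id {
                println!("  - {id}");
            }
        }
    }
}
```

The returned sets borrow from the input slice, so the `CipherView` values must outlive any report built from them.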
@@ -79,9 +89,9 @@ fn normalize_name_for_matching(name: &str) -> String { /// * Host appends the explicit port only when one is specified in the URI (see limitations) /// * Exact performs no normalization /// -/// Limitations when using 'Host' strategy: -/// Unlike the URL parsing library used in the web vault -/// [ https://nodejs.org/api/url.html#the-whatwg-url-api ] +/// # Limitations when using 'Host' strategy: +/// Unlike the URL parsing library used in the web app +/// https://nodejs.org/api/url.html#the-whatwg-url-api /// the url crate will strip explicit port numbers /// matching the default port for a given url scheme: /// * http://some.domain:80 => some.domain (http default port 80 is not retained) @@ -104,7 +114,7 @@ fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Op let url = Url::parse(uri).ok()?; let host = url.host_str()?; // Treat raw IP addresses (v4 or v6) as non-domain (no registrable domain to compare) - // This is another divergence from original web vault implementation + // This is another divergence from the original web app implementation if host.parse::().is_ok() { return None; } @@ -279,7 +289,7 @@ pub fn find_duplicate_sets<'a>( } } - // Convert to DuplicateSet with Web Vault display key formatting + // Convert to DuplicateSet with Web app display key formatting let mut sets: Vec = strongest_matches .into_values() .map(|(_p, kind, key, members)| { @@ -479,8 +489,8 @@ mod tests { /* * Due to limitations in the url crate, port cannot be obtained when it matches * the default ports for provided schemes. This is a divergence from the URL - * parsing library used in the web vault: https://nodejs.org/api/url.html#the-whatwg-url-api - * but should be considered an edge case (most users would not supply shceme & defauult port + * parsing library used in the web app: https://nodejs.org/api/url.html#the-whatwg-url-api + * but should be considered an edge case (most users would not supply scheme and default port * and default port shouldn't be included by browser for http & https schemes) #[test] fn test_normalize_uri_host_explicit_default_port() { @@ -503,21 +513,20 @@ mod tests { #[test] fn test_normalize_uri_host_ipv6_explicit_default_port() { let uri = "https://[2001:db8::1]:443/"; // explicit default https port on IPv6 - assert_eq!( - normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), + assert_eq!(normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), Some("[2001:db8::1]:443".to_string()) ); } + */ #[test] fn test_normalize_uri_host_userinfo_no_port() { - let uri = "https://user:pass@example.com/path"; // userinfo present, no explicit port + let uri = "https://user:pass@example.com/path"; // userinfo present, no explicit port assert_eq!( normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), Some("example.com".to_string()) ); } - */ #[test] fn test_normalize_uri_host_ipv6_non_default_port() { @@ -710,7 +719,7 @@ mod tests { #[test] fn test_ciphers_without_ids_are_ignored() { - // Both ciphers lack IDs so they cannot participate; no duplicate set produced. + // Both ciphers lack IDs, so they cannot participate; no duplicate set produced. 
let c1 = make_note_cipher(None, "SameName"); let c2 = make_note_cipher(None, " same name "); let ciphers = vec![c1, c2]; From e522781fd873888ef6cd9fffaf08590ee0e445c5 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Sat, 30 Aug 2025 05:40:37 -0700 Subject: [PATCH 08/10] clarified comments before PR --- crates/bitwarden-vault/src/duplicate_detection.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index dd72a6a29..3ea055cac 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -18,7 +18,7 @@ //! in this module but does so in the Web app //! * The Host [ DuplicateUriMatchType ] does not support port numbers when they match the default //! for a given scheme (80 for http, 443 for https). Only port numbers not matching a scheme's -//! default will be parsed and retained. This is due to limitations in the url crate, and could +//! default will be parsed and retained. This is due to limitations in the url crate and could //! likely be fixed using regular expressions. //! //! ## Logical flow: @@ -89,7 +89,7 @@ fn normalize_name_for_matching(name: &str) -> String { /// * Host appends the explicit port only when one is specified in the URI (see limitations) /// * Exact performs no normalization /// -/// # Limitations when using 'Host' strategy: +/// # Limitations: /// Unlike the URL parsing library used in the web app /// https://nodejs.org/api/url.html#the-whatwg-url-api /// the url crate will strip explicit port numbers @@ -97,6 +97,9 @@ fn normalize_name_for_matching(name: &str) -> String { /// * http://some.domain:80 => some.domain (http default port 80 is not retained) /// * https://some.domain:443 => some.domain (https default port 443 is not retained) /// * https://some.domain:4444 => some.domain:4444 +/// This applies to the Host strategy only. +/// +/// Raw IP addresses (IPv4 and IPv6) will not be considered when the Domain strategy is used. /// /// Examples: /// ```text From 70867edd9425a309aabd418d0ce21ff8d3135226 Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Sun, 31 Aug 2025 16:32:15 -0700 Subject: [PATCH 09/10] Added test coverage & fixed clippy errors --- .../src/duplicate_detection.rs | 303 ++++++++++++++---- 1 file changed, 246 insertions(+), 57 deletions(-) diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index 3ea055cac..8700a2a02 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -14,12 +14,12 @@ //! The logic in this module was originally implemented in the web app //! [ https://github.com/bitwarden/clients/pull/15967 ] //! and adapted to the Rust API. Some notable divergences: -//! * The Domain [ DuplicateUriMatchType ] does not support IPv4 and IPv6 addresses -//! in this module but does so in the Web app +//! * The Domain [ DuplicateUriMatchType ] does not support IPv4 and IPv6 addresses in this module +//! but does so in the Web app //! * The Host [ DuplicateUriMatchType ] does not support port numbers when they match the default -//! for a given scheme (80 for http, 443 for https). Only port numbers not matching a scheme's -//! default will be parsed and retained. This is due to limitations in the url crate and could -//! 
likely be fixed using regular expressions. +//! for a given scheme (80 for http, 443 for https). Only port numbers not matching a scheme's +//! default will be parsed and retained. This is due to limitations in the url crate and could +//! likely be fixed using regular expressions. //! //! ## Logical flow: //! 1. Iterate over CipherViews, skipping those without an id (cannot be deleted after review). @@ -51,6 +51,7 @@ pub struct DuplicateSet<'a> { /// * Hostname: compares the full hostname without port (e.g. `sub.example.co.uk`). /// * Host: compares hostname and an explicitly specified port (omits default/implicit ports) /// * Exact: compares the full original URI string verbatim. +#[derive(Debug, Clone, Copy)] pub enum DuplicateUriMatchType { /// Match by the effective registrable domain portion of the host. Domain, @@ -97,6 +98,7 @@ fn normalize_name_for_matching(name: &str) -> String { /// * http://some.domain:80 => some.domain (http default port 80 is not retained) /// * https://some.domain:443 => some.domain (https default port 443 is not retained) /// * https://some.domain:4444 => some.domain:4444 +/// /// This applies to the Host strategy only. /// /// Raw IP addresses (IPv4 and IPv6) will not be considered when the Domain strategy is used. @@ -421,7 +423,6 @@ mod tests { } } - // ---- normalize_name_for_matching tests ---- #[test] fn test_normalize_name_empty_and_whitespace() { assert_eq!(normalize_name_for_matching(""), ""); @@ -449,8 +450,6 @@ mod tests { assert_eq!(normalize_name_for_matching(s), "namewithspaces"); } - // ---- normalize_uri_for_matching tests ---- - #[test] fn test_normalize_uri_domain_basic() { let uri = "https://sub.example.co.uk/path"; @@ -468,12 +467,17 @@ mod tests { } #[test] - fn test_normalize_uri_hostname_and_host() { + fn test_normalize_uri_hostname() { let uri = "https://app.example.com:8443/a"; assert_eq!( normalize_uri_for_matching(uri, &DuplicateUriMatchType::Hostname), Some("app.example.com".to_string()) ); + } + + #[test] + fn test_normalize_uri_host() { + let uri = "https://app.example.com:8443/a"; assert_eq!( normalize_uri_for_matching(uri, &DuplicateUriMatchType::Host), Some("app.example.com:8443".to_string()) @@ -562,16 +566,21 @@ mod tests { ); } - // ---- find_duplicate_sets tests ---- - #[test] - fn test_find_duplicate_sets_empty_input() { - let sets = find_duplicate_sets(&[], DuplicateUriMatchType::Domain); - assert!(sets.is_empty()); + fn test_find_duplicate_sets_empty_input_all_strategies() { + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&[], strategy); + assert!(sets.is_empty()); + } } #[test] - fn test_username_uri_duplicate_basic() { + fn test_username_uri_duplicate_all_strategies() { let c1 = make_login_cipher( Some("11111111-1111-1111-1111-111111111111"), Some("alice"), @@ -585,14 +594,21 @@ mod tests { "Site B", ); let ciphers = vec![c1, c2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert_eq!(sets.len(), 1); // domain example.com - assert!(sets[0].key.starts_with("username+uri:")); - assert_eq!(sets[0].ciphers.len(), 2); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + assert!(sets[0].key.starts_with("username+uri:")); + 
assert_eq!(sets[0].ciphers.len(), 2); + } } #[test] - fn test_username_name_precedence_lower_than_uri() { + fn test_username_name_precedence_lower_than_uri_all_strategies() { let c1 = make_login_cipher( Some("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"), Some("bob"), @@ -606,26 +622,40 @@ mod tests { "Foo", ); let ciphers = vec![c1, c2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert_eq!(sets.len(), 1); - assert!(sets[0].key.starts_with("username+uri:")); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + assert!(sets[0].key.starts_with("username+uri:")); + } } #[test] - fn test_name_only_duplicates() { + fn test_name_only_duplicates_all_strategies() { let n1 = make_note_cipher(Some("33333333-3333-3333-3333-333333333333"), "Shared Note"); let n2 = make_note_cipher( Some("44444444-4444-4444-4444-444444444444"), " shared note ", ); let ciphers = vec![n1, n2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert_eq!(sets.len(), 1); - assert!(sets[0].key.starts_with("username+name:")); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + assert!(sets[0].key.starts_with("username+name:")); + } } #[test] - fn test_cipher_in_multiple_sets_via_distinct_uris() { + fn test_cipher_in_multiple_sets_via_distinct_uris_all_strategies() { let a = make_login_cipher( Some("55555555-5555-5555-5555-555555555555"), Some("alice"), @@ -645,18 +675,26 @@ mod tests { "Two", ); let ciphers = vec![a, b, c]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert_eq!(sets.len(), 2); - let multi_appearances = sets.iter().filter(|s| { - s.ciphers.iter().any(|c| { - c.id.map(|id| id.to_string()) == Some("55555555-5555-5555-5555-555555555555".into()) - }) - }); - assert_eq!(multi_appearances.count(), 2); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 2, "strategy {:?}", strategy); + let multi_appearances = sets.iter().filter(|s| { + s.ciphers.iter().any(|c| { + c.id.map(|id| id.to_string()) + == Some("55555555-5555-5555-5555-555555555555".into()) + }) + }); + assert_eq!(multi_appearances.count(), 2); + } } #[test] - fn test_identical_membership_multiple_normalized_uris_collapse() { + fn test_identical_membership_multiple_normalized_uris_collapse_all_strategies() { let c1 = make_login_cipher( Some("88888888-8888-8888-8888-888888888888"), Some("user"), @@ -670,16 +708,19 @@ mod tests { "Y", ); let ciphers = vec![c1, c2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Hostname); - assert_eq!(sets.len(), 1); - assert!( - sets[0].key.contains("user @ x.example.com") - || sets[0].key.contains("user @ y.example.com") - ); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + } } #[test] - fn 
test_per_cipher_uri_deduplication() { + fn test_per_cipher_uri_deduplication_all_strategies() { let c1 = make_login_cipher( Some("10101010-1010-1010-1010-101010101010"), Some("u"), @@ -693,13 +734,20 @@ mod tests { "B", ); let ciphers = vec![c1, c2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Hostname); - assert_eq!(sets.len(), 1); - assert_eq!(sets[0].ciphers.len(), 2); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + assert_eq!(sets[0].ciphers.len(), 2); + } } #[test] - fn test_domain_strategy_groups_subdomains() { + fn test_domain_strategy_collapses_distinct_subdomains() { let c1 = make_login_cipher( Some("12121212-1212-1212-1212-121212121212"), Some("user"), @@ -718,20 +766,30 @@ mod tests { let ciphers_hostname = vec![c1, c2]; let sets_hostname = find_duplicate_sets(&ciphers_hostname, DuplicateUriMatchType::Hostname); assert!(sets_hostname.is_empty()); + let sets_host = find_duplicate_sets(&ciphers_hostname, DuplicateUriMatchType::Host); + assert!(sets_host.is_empty()); + let sets_exact = find_duplicate_sets(&ciphers_hostname, DuplicateUriMatchType::Exact); + assert!(sets_exact.is_empty()); } #[test] - fn test_ciphers_without_ids_are_ignored() { - // Both ciphers lack IDs, so they cannot participate; no duplicate set produced. + fn test_ciphers_without_ids_are_ignored_all_strategies() { let c1 = make_note_cipher(None, "SameName"); let c2 = make_note_cipher(None, " same name "); let ciphers = vec![c1, c2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert!(sets.is_empty()); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert!(sets.is_empty(), "strategy {:?}", strategy); + } } #[test] - fn test_username_name_does_not_cross_with_name_only() { + fn test_username_name_does_not_cross_with_name_only_all_strategies() { let login = make_login_cipher( Some("14141414-1414-1414-1414-141414141414"), Some("user"), @@ -741,9 +799,140 @@ mod tests { let note1 = make_note_cipher(Some("15151515-1515-1515-1515-151515151515"), "Duplicate"); let note2 = make_note_cipher(Some("16161616-1616-1616-1616-161616161616"), "duplicate"); let ciphers = vec![login, note1, note2]; - let sets = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); - assert_eq!(sets.len(), 1); - assert!(sets[0].key.starts_with("username+name: &")); - assert_eq!(sets[0].ciphers.len(), 2); + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + assert!(sets[0].key.starts_with("username+name: &")); + assert_eq!(sets[0].ciphers.len(), 2); + } + } + + #[test] + fn test_host_strategy_distinguishes_ports() { + let c1 = make_login_cipher( + Some("17171717-1717-1717-1717-171717171717"), + Some("user"), + &["https://example.com:8443"], + "A", + ); + let c2 = make_login_cipher( + Some("18181818-1818-1818-1818-181818181818"), + Some("user"), + &["https://example.com:9443"], + "B", + ); + let ciphers = vec![c1.clone(), c2.clone()]; + // Domain & Hostname collapse (ignore differing ports) -> 
duplicates + let sets_domain = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert_eq!(sets_domain.len(), 1); + let sets_hostname = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Hostname); + assert_eq!(sets_hostname.len(), 1); + // Host distinguishes ports -> no duplicates + let sets_host = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Host); + assert!(sets_host.is_empty()); + // Exact includes full string -> different (ports differ) -> no duplicates + let sets_exact = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Exact); + assert!(sets_exact.is_empty()); + // Control: identical ports should duplicate under Host + let c3 = make_login_cipher( + Some("19191919-1919-1919-1919-191919191919"), + Some("user"), + &["https://example.com:8443"], + "C", + ); + let ciphers_same_port = vec![c1, c3]; + let sets_host_same_port = + find_duplicate_sets(&ciphers_same_port, DuplicateUriMatchType::Host); + assert_eq!(sets_host_same_port.len(), 1); + } + + #[test] + fn test_exact_strategy_requires_full_uri_match() { + let c1 = make_login_cipher( + Some("20202020-2020-2020-2020-202020202020"), + Some("user"), + &["https://example.com"], + "A", + ); + let c2 = make_login_cipher( + Some("21212121-2121-2121-2121-212121212121"), + Some("user"), + &["https://example.com/login"], + "B", + ); + let ciphers = vec![c1, c2]; + // Domain / Hostname / Host collapse (same host) -> duplicates + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + } + // Exact uses full string -> no duplicates + let sets_exact = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Exact); + assert!(sets_exact.is_empty()); + } + + #[test] + fn test_hostname_strategy_groups_identical_full_subdomain_only() { + // Two identical subdomain hosts should group for all strategies. + let c1 = make_login_cipher( + Some("22222222-3333-4444-5555-666666666666"), + Some("user"), + &["https://login.app.example.com"], + "One", + ); + let c2 = make_login_cipher( + Some("77777777-8888-9999-aaaa-bbbbbbbbbbbb"), + Some("user"), + &["https://login.app.example.com"], + "Two", + ); + let ciphers = vec![c1, c2]; + for strategy in [ + DuplicateUriMatchType::Domain, + DuplicateUriMatchType::Hostname, + DuplicateUriMatchType::Host, + DuplicateUriMatchType::Exact, + ] { + let sets = find_duplicate_sets(&ciphers, strategy); + assert_eq!(sets.len(), 1, "strategy {:?}", strategy); + } + } + + #[test] + fn test_hostname_strategy_ipv6_grouping() { + // IPv6 host should produce duplicates under Hostname/Host/Exact but Domain (IP) ignored. 
+ let c1 = make_login_cipher( + Some("abcdabcd-abcd-abcd-abcd-abcdabcdabcd"), + Some("user"), + &["https://[2001:db8::1]/login"], + "One", + ); + let c2 = make_login_cipher( + Some("dcba4321-dcba-4321-dcba-4321dcba4321"), + Some("user"), + &["https://[2001:db8::1]/account"], + "Two", + ); + let ciphers = vec![c1, c2]; + let sets_domain = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Domain); + assert!(sets_domain.is_empty()); + // Hostname groups (same host w/out port) + let sets_hostname = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Hostname); + assert_eq!(sets_hostname.len(), 1); + // Host groups (no port specified) + let sets_host = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Host); + assert_eq!(sets_host.len(), 1); + // Exact considers full strings different (paths differ) so no grouping + let sets_exact = find_duplicate_sets(&ciphers, DuplicateUriMatchType::Exact); + assert!(sets_exact.is_empty()); } } From 832b4b8995a951b368b7da72a9d2bff3b328444e Mon Sep 17 00:00:00 2001 From: John Harrington <84741727+harr1424@users.noreply.github.com> Date: Mon, 1 Sep 2025 16:18:31 -0700 Subject: [PATCH 10/10] minor edits to doc and code comments --- .../src/duplicate_detection.rs | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/crates/bitwarden-vault/src/duplicate_detection.rs b/crates/bitwarden-vault/src/duplicate_detection.rs index 8700a2a02..2a6215501 100644 --- a/crates/bitwarden-vault/src/duplicate_detection.rs +++ b/crates/bitwarden-vault/src/duplicate_detection.rs @@ -1,22 +1,19 @@ //! # Duplicate cipher detection //! -//! Duplicate detection utilities for grouping related [`CipherView`] values that -//! appear to represent the same real-world login. Buckets are built along three -//! axes (username+URI, username+name, and name-only) using normalization rules -//! so that visually similar entries (e.g., differing only by whitespace or -//! subdomain) coalesce. When multiple bucket types cover the exact same set of -//! ciphers, only the highest-precedence bucket (username+uri > username+name > -//! name-only) is kept to avoid redundant duplicate reports. A cipher can still -//! appear in multiple returned groups if the underlying membership differs -//! (e.g., the same username across two different domains). Normalization is biased -//! toward collapsing obviously equivalent values without losing distinct data. +//! Duplicate cipher detection utilities. +//! +//! Ciphers are checked for redundancy using combinations of fields: +//! username+URI, username+name, and name-only. Normalization rules are applied to name and URIs +//! so that similar entries coalesce. When multiple bucket types cover the exact same set of +//! ciphers, only the highest-precedence set type (username+uri > username+name > +//! name-only) is kept to avoid redundant duplicate reports. //! //! The logic in this module was originally implemented in the web app //! [ https://github.com/bitwarden/clients/pull/15967 ] //! and adapted to the Rust API. Some notable divergences: -//! * The Domain [ DuplicateUriMatchType ] does not support IPv4 and IPv6 addresses in this module +//! * The Domain [`DuplicateUriMatchType`] does not support IPv4 and IPv6 addresses in this module //! but does so in the Web app -//! * The Host [ DuplicateUriMatchType ] does not support port numbers when they match the default +//! * The Host [`DuplicateUriMatchType`] does not support port numbers when they match the default //! for a given scheme (80 for http, 443 for https). 
Only port numbers not matching a scheme's //! default will be parsed and retained. This is due to limitations in the url crate and could //! likely be fixed using regular expressions. @@ -70,7 +67,6 @@ pub enum DuplicateUriMatchType { /// /// Returns a newly allocated `String` containing the normalized form. fn normalize_name_for_matching(name: &str) -> String { - // Allocate once; skip whitespace and push lowercase chars. let mut out = String::with_capacity(name.len()); for ch in name.chars() { if !ch.is_whitespace() { @@ -97,9 +93,9 @@ fn normalize_name_for_matching(name: &str) -> String { /// matching the default port for a given url scheme: /// * http://some.domain:80 => some.domain (http default port 80 is not retained) /// * https://some.domain:443 => some.domain (https default port 443 is not retained) -/// * https://some.domain:4444 => some.domain:4444 +/// * https://some.domain:4444 => some.domain:4444 (explicit port 4444 is retained) /// -/// This applies to the Host strategy only. +/// The above limitation applies to the Host strategy only. /// /// Raw IP addresses (IPv4 and IPv6) will not be considered when the Domain strategy is used. /// @@ -119,7 +115,7 @@ fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Op let url = Url::parse(uri).ok()?; let host = url.host_str()?; // Treat raw IP addresses (v4 or v6) as non-domain (no registrable domain to compare) - // This is another divergence from the original web app implementation + // This is a divergence from the original web app implementation if host.parse::().is_ok() { return None; } @@ -151,14 +147,14 @@ fn normalize_uri_for_matching(uri: &str, strategy: &DuplicateUriMatchType) -> Op /// * name-only: normalized name when the username is missing /// /// Normalization: -/// * URIs: [normalize_uri_for_matching] and the provided strategy -/// * Names: [normalize_name_for_matching] (whitespace removed, lowercase) +/// * URIs: [`normalize_uri_for_matching`] and the provided strategy +/// * Names: [`normalize_name_for_matching`] (whitespace removed, lowercase) /// /// When different buckets contain the exact same cipher membership, only the /// highest-precedence bucket is retained (username+uri > username+name > name-only). /// /// Display key formats: -/// * username+uri: @ +/// * username+uri: @ /// * username+name: & /// * name-only: & (blank username) /// @@ -216,7 +212,7 @@ pub fn find_duplicate_sets<'a>( }; let has_username = !username.is_empty(); - // Username + URI buckets (avoid redundant URIs for each cipher) + // Create Username + URI buckets while avoiding redundant URIs for each cipher if has_username && !uri_strings.is_empty() { let mut per_cipher_seen: HashSet = HashSet::new(); for raw_uri in uri_strings.iter() { @@ -259,9 +255,10 @@ pub fn find_duplicate_sets<'a>( } } - // Helper to produce a stable, order-independent membership signature. + // Helper to produce a stable, order-independent membership signature + // (participating cipher ids joined and sorted) // All ciphers inserted into buckets have an id (guard enforced earlier). 
- fn signature(ciphers: &[&CipherView]) -> String { + fn membership_signature(ciphers: &[&CipherView]) -> String { let mut ids: Vec = Vec::with_capacity(ciphers.len()); for c in ciphers.iter() { if let Some(id) = &c.id { @@ -272,7 +269,8 @@ pub fn find_duplicate_sets<'a>( ids.join("|") } - // key: signature -> value: (precedence, BucketKind, grouping_key, members) + // A HashMap with key corresponding to a signature created by [`membership_signature`] + // and value corresponding to a (precedence, BucketKind, grouping_key, members) tuple let mut strongest_matches: HashMap)> = HashMap::new(); @@ -280,7 +278,7 @@ pub fn find_duplicate_sets<'a>( if members.len() < 2 { continue; } - let signature = signature(&members); + let signature = membership_signature(&members); let precedence = kind.precedence(); match strongest_matches.entry(signature) { Entry::Vacant(vacant) => { @@ -305,7 +303,9 @@ pub fn find_duplicate_sets<'a>( let display = match kind { BucketKind::UsernameUri => key .split_once(KEY_SEP) - .map(|(user, uri_part)| format!("username+uri: {user} @ {uri_part}")) + .map(|(user, uri)| format!("username+uri: {user} @ {uri}")) + // considering the key is composed of cipher ids, it may be more appropriate to + // return some other descriptive value as a fallback .unwrap_or_else(|| format!("username+uri: {key}")), BucketKind::UsernameName => key .split_once(KEY_SEP) @@ -342,7 +342,7 @@ mod tests { name: &str, ) -> CipherView { CipherView { - id: id.map(|s| s.parse().expect("valid UUID literal")), + id: id.map(|s| s.parse().expect("invalid UUID literal")), organization_id: None, folder_id: None, collection_ids: vec![], @@ -394,7 +394,7 @@ mod tests { // Helper to build non-login cipher view (SecureNote) fn make_note_cipher(id: Option<&str>, name: &str) -> CipherView { CipherView { - id: id.map(|s| s.parse().expect("valid UUID literal")), + id: id.map(|s| s.parse().expect("invalid UUID literal")), organization_id: None, folder_id: None, collection_ids: vec![], @@ -909,7 +909,7 @@ mod tests { #[test] fn test_hostname_strategy_ipv6_grouping() { - // IPv6 host should produce duplicates under Hostname/Host/Exact but Domain (IP) ignored. + // IPv6 host should produce duplicates under Hostname/Host/Exact, but Domain (IP) ignored. let c1 = make_login_cipher( Some("abcdabcd-abcd-abcd-abcd-abcdabcdabcd"), Some("user"),