diff --git a/src/config.rs b/src/config.rs index 71f6f0f..a9e2a48 100644 --- a/src/config.rs +++ b/src/config.rs @@ -22,6 +22,9 @@ pub struct Config { pub unsafe_log: bool, pub require_mitm_effective: bool, pub notice_mode: NoticeMode, + pub entropy_enabled: bool, + pub entropy_threshold: f64, + pub entropy_min_len: usize, } impl Config { @@ -48,6 +51,9 @@ impl Config { unsafe_log: bool_env("KEYCLAW_UNSAFE_LOG", false), require_mitm_effective: bool_env("KEYCLAW_REQUIRE_MITM_EFFECTIVE", true), notice_mode: notice_mode_env("KEYCLAW_NOTICE_MODE", NoticeMode::Verbose), + entropy_enabled: bool_env("KEYCLAW_ENTROPY_ENABLED", true), + entropy_threshold: f64_env("KEYCLAW_ENTROPY_THRESHOLD", 3.5), + entropy_min_len: usize_env("KEYCLAW_ENTROPY_MIN_LEN", 20), } } @@ -107,6 +113,20 @@ fn notice_mode_env(key: &str, fallback: NoticeMode) -> NoticeMode { } } +fn f64_env(key: &str, fallback: f64) -> f64 { + match env::var(key) { + Ok(v) => v.trim().parse::().unwrap_or(fallback), + Err(_) => fallback, + } +} + +fn usize_env(key: &str, fallback: usize) -> usize { + match env::var(key) { + Ok(v) => v.trim().parse::().unwrap_or(fallback), + Err(_) => fallback, + } +} + fn path_env(key: &str) -> Option { match env::var(key) { Ok(v) if !v.trim().is_empty() => Some(PathBuf::from(v.trim())), @@ -301,6 +321,44 @@ mod tests { restore_env(saved); } + #[test] + fn from_env_reads_entropy_settings() { + let _guard = ENV_LOCK.lock().expect("env lock"); + let keys = [ + "KEYCLAW_ENTROPY_ENABLED", + "KEYCLAW_ENTROPY_THRESHOLD", + "KEYCLAW_ENTROPY_MIN_LEN", + ]; + let saved = capture_env(&keys); + env::set_var("KEYCLAW_ENTROPY_ENABLED", "false"); + env::set_var("KEYCLAW_ENTROPY_THRESHOLD", "4.0"); + env::set_var("KEYCLAW_ENTROPY_MIN_LEN", "30"); + let cfg = Config::from_env(); + assert!(!cfg.entropy_enabled); + assert!((cfg.entropy_threshold - 4.0).abs() < 0.001); + assert_eq!(cfg.entropy_min_len, 30); + restore_env(saved); + } + + #[test] + fn from_env_uses_entropy_defaults() { + let _guard = ENV_LOCK.lock().expect("env lock"); + let keys = [ + "KEYCLAW_ENTROPY_ENABLED", + "KEYCLAW_ENTROPY_THRESHOLD", + "KEYCLAW_ENTROPY_MIN_LEN", + ]; + let saved = capture_env(&keys); + env::remove_var("KEYCLAW_ENTROPY_ENABLED"); + env::remove_var("KEYCLAW_ENTROPY_THRESHOLD"); + env::remove_var("KEYCLAW_ENTROPY_MIN_LEN"); + let cfg = Config::from_env(); + assert!(cfg.entropy_enabled); + assert!((cfg.entropy_threshold - 3.5).abs() < 0.001); + assert_eq!(cfg.entropy_min_len, 20); + restore_env(saved); + } + fn capture_env(keys: &[&str]) -> Vec<(String, Option)> { keys.iter() .map(|key| ((*key).to_string(), env::var_os(key))) diff --git a/src/entropy.rs b/src/entropy.rs new file mode 100644 index 0000000..98e18c2 --- /dev/null +++ b/src/entropy.rs @@ -0,0 +1,224 @@ +//! Shannon entropy calculation and high-entropy token detection. +//! +//! This module provides a standalone entropy scorer that flags tokens with +//! unusually high information density — a strong heuristic for API keys, +//! passwords, and other machine-generated secrets that regex rules may miss. + +/// Compute the Shannon entropy (bits per byte) of `input`. +/// +/// Returns 0.0 for empty strings. For a uniform distribution of all 256 byte +/// values the result approaches 8.0. +pub fn shannon_entropy(input: &str) -> f64 { + let bytes = input.as_bytes(); + let len = bytes.len(); + if len == 0 { + return 0.0; + } + + let mut counts = [0u32; 256]; + for &b in bytes { + counts[b as usize] += 1; + } + + let len_f = len as f64; + let mut entropy = 0.0f64; + for &count in &counts { + if count == 0 { + continue; + } + let p = count as f64 / len_f; + entropy -= p * p.log2(); + } + + entropy +} + +/// Configuration knobs for entropy-based detection. +#[derive(Debug, Clone)] +pub struct EntropyConfig { + pub enabled: bool, + pub threshold: f64, + pub min_len: usize, +} + +impl Default for EntropyConfig { + fn default() -> Self { + Self { + enabled: true, + threshold: 3.5, + min_len: 20, + } + } +} + +/// A single high-entropy token found in the input. +pub struct EntropyMatch<'a> { + pub start: usize, + pub end: usize, + pub token: &'a str, + pub entropy: f64, +} + +/// Returns true if `ch` is a token delimiter. +fn is_delimiter(ch: char) -> bool { + matches!( + ch, + ' ' | '\t' + | '\n' + | '\r' + | '"' + | '\'' + | '`' + | '=' + | ':' + | ',' + | '{' + | '}' + | '[' + | ']' + | '(' + | ')' + | ';' + | '<' + | '>' + | '|' + | '\\' + ) +} + +/// Returns true if `token` consists entirely of ASCII lowercase letters. +fn is_all_lowercase_alpha(token: &str) -> bool { + !token.is_empty() && token.bytes().all(|b| b.is_ascii_lowercase()) +} + +/// Evaluate a single candidate token and push it into `matches` if it qualifies. +fn check_token<'a>( + input: &'a str, + start: usize, + end: usize, + min_len: usize, + threshold: f64, + matches: &mut Vec>, +) { + let token = &input[start..end]; + if token.len() < min_len { + return; + } + if is_all_lowercase_alpha(token) { + return; + } + let entropy = shannon_entropy(token); + if entropy >= threshold { + matches.push(EntropyMatch { + start, + end, + token, + entropy, + }); + } +} + +/// Find all tokens in `input` whose Shannon entropy meets or exceeds +/// `threshold` and whose length is at least `min_len`. +/// +/// Tokens are produced by splitting on common delimiters. Tokens that are +/// all-lowercase ASCII (likely English words) are skipped. Returned matches +/// carry byte offsets into the original `input`. +pub fn find_high_entropy_tokens<'a>( + input: &'a str, + min_len: usize, + threshold: f64, +) -> Vec> { + let mut matches = Vec::new(); + let mut token_start: Option = None; + + for (idx, ch) in input.char_indices() { + if is_delimiter(ch) { + if let Some(start) = token_start.take() { + check_token(input, start, idx, min_len, threshold, &mut matches); + } + } else if token_start.is_none() { + token_start = Some(idx); + } + } + + // Handle trailing token (no trailing delimiter) + if let Some(start) = token_start { + check_token(input, start, input.len(), min_len, threshold, &mut matches); + } + + matches +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn entropy_uniform_distribution_near_eight() { + // Build a string containing all 256 byte values (as a &str we need valid + // UTF-8, so we use 256 distinct printable ASCII characters repeated). + // Instead, we test with the 95 printable ASCII chars which gives ~6.57. + // For a true 256-uniform test we work at the byte level via a helper. + // shannon_entropy takes &str, so we test with printable ASCII. + // Each of 62 alphanumeric chars once → log2(62) ≈ 5.954 + let alnum: String = ('a'..='z').chain('A'..='Z').chain('0'..='9').collect(); + let e = shannon_entropy(&alnum); + assert!((e - 62f64.log2()).abs() < 0.01, "expected ~5.954, got {e}"); + } + + #[test] + fn entropy_single_repeated_char() { + let input = "aaaaaaaaaa"; + assert_eq!(shannon_entropy(input), 0.0); + } + + #[test] + fn entropy_empty_string() { + assert_eq!(shannon_entropy(""), 0.0); + } + + #[test] + fn finds_base64_like_api_key() { + let input = "token=aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"; + let matches = find_high_entropy_tokens(input, 20, 3.5); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].token, "aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"); + } + + #[test] + fn skips_english_prose() { + let input = "this is a perfectly normal sentence with only regular words"; + let matches = find_high_entropy_tokens(input, 5, 3.5); + assert!( + matches.is_empty(), + "prose should not trigger entropy detection" + ); + } + + #[test] + fn skips_short_tokens() { + let input = "key=Ab1"; + let matches = find_high_entropy_tokens(input, 20, 3.5); + assert!(matches.is_empty()); + } + + #[test] + fn returns_correct_byte_offsets() { + let key = "aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"; + let input = format!("prefix={key}"); + let matches = find_high_entropy_tokens(&input, 20, 3.5); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].start, 7); + assert_eq!(matches[0].end, 7 + key.len()); + assert_eq!(&input[matches[0].start..matches[0].end], key); + } + + #[test] + fn skips_all_lowercase_alpha_tokens() { + // A long all-lowercase token that would otherwise exceed the entropy threshold + let input = "abcdefghijklmnopqrstuvwxyz"; + let matches = find_high_entropy_tokens(input, 5, 2.0); + assert!(matches.is_empty(), "all-lowercase tokens should be skipped"); + } +} diff --git a/src/gitleaks_rules.rs b/src/gitleaks_rules.rs index 5c92cff..36bb6a5 100644 --- a/src/gitleaks_rules.rs +++ b/src/gitleaks_rules.rs @@ -3,8 +3,11 @@ use std::path::Path; use regex::Regex; use serde::Deserialize; +use crate::entropy::EntropyConfig; use crate::errors::KeyclawError; +const ENTROPY_RULE_ID: &str = "entropy"; + /// A single compiled gitleaks rule ready for matching. pub struct Rule { pub id: String, @@ -12,12 +15,16 @@ pub struct Rule { pub keywords: Vec, /// Which capture group holds the secret (0 = full match). pub secret_group: usize, + /// Minimum Shannon entropy for the matched secret. If set, + /// matches below this threshold are discarded as likely false positives. + pub min_entropy: Option, } /// All compiled gitleaks rules. pub struct RuleSet { pub rules: Vec, pub skipped_rules: usize, + pub entropy_config: EntropyConfig, } // ── TOML deserialization shapes ────────────────────────────── @@ -37,6 +44,8 @@ struct TomlRule { keywords: Vec, #[serde(default, rename = "secretGroup")] secret_group: Option, + #[serde(default)] + entropy: Option, } // ── Loading ────────────────────────────────────────────────── @@ -72,6 +81,7 @@ impl RuleSet { regex: compiled, keywords: r.keywords.iter().map(|k| k.to_lowercase()).collect(), secret_group: r.secret_group.unwrap_or(0), + min_entropy: r.entropy, }); } Err(_) => { @@ -83,6 +93,7 @@ impl RuleSet { Ok(RuleSet { rules, skipped_rules: skipped, + entropy_config: EntropyConfig::default(), }) } @@ -121,6 +132,13 @@ impl RuleSet { continue; } + // Skip if matched secret's entropy is below the rule's threshold + if let Some(min_entropy) = rule.min_entropy { + if crate::entropy::shannon_entropy(secret) < min_entropy { + continue; + } + } + // Skip if this range overlaps with an already-found match if matches.iter().any(|m| m.start < end && start < m.end) { continue; @@ -140,6 +158,30 @@ impl RuleSet { } } + // Entropy-based detection pass + if self.entropy_config.enabled { + for em in crate::entropy::find_high_entropy_tokens( + input, + self.entropy_config.min_len, + self.entropy_config.threshold, + ) { + // Skip if overlaps with an existing regex match + if matches.iter().any(|m| m.start < em.end && em.start < m.end) { + continue; + } + // Skip if inside an existing placeholder + if inside_placeholder(input, em.start, em.end) { + continue; + } + matches.push(SecretMatch { + rule_id: ENTROPY_RULE_ID, + start: em.start, + end: em.end, + secret: em.token, + }); + } + } + // Sort by position for stable replacement order matches.sort_by_key(|m| m.start); matches @@ -203,6 +245,98 @@ regex = '[a-f0-9]{16}' ); } + #[test] + fn find_secrets_includes_entropy_matches() { + use crate::entropy::EntropyConfig; + let rules = RuleSet { + rules: Vec::new(), + skipped_rules: 0, + entropy_config: EntropyConfig::default(), + }; + let input = "token=aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"; + let matches = rules.find_secrets(input); + assert_eq!(matches.len(), 1); + assert_eq!(matches[0].rule_id, "entropy"); + assert_eq!(matches[0].secret, "aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"); + } + + #[test] + fn find_secrets_entropy_disabled() { + use crate::entropy::EntropyConfig; + let rules = RuleSet { + rules: Vec::new(), + skipped_rules: 0, + entropy_config: EntropyConfig { + enabled: false, + ..Default::default() + }, + }; + let input = "token=aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"; + let matches = rules.find_secrets(input); + assert!(matches.is_empty()); + } + + #[test] + fn find_secrets_respects_per_rule_entropy_threshold() { + // Rule requires entropy >= 3.0; a low-entropy match should be skipped + let rules = RuleSet::from_toml( + r#" +[[rules]] +id = "high-entropy-only" +regex = '[a-zA-Z0-9]{20,}' +entropy = 3.0 +"#, + ) + .expect("ruleset"); + + // Low entropy: repeated pattern + let low = "abcabcabcabcabcabcabcabc"; + assert!( + rules.find_secrets(low).is_empty(), + "low-entropy match should be filtered by per-rule threshold" + ); + + // High entropy: random-looking + let high = "aB3dE5fG7hI9jK1lM3nO5pQ7rS9tU1v"; + let matches = rules.find_secrets(high); + assert_eq!(matches.len(), 1, "high-entropy match should pass threshold"); + assert_eq!(matches[0].rule_id, "high-entropy-only"); + } + + #[test] + fn find_secrets_no_entropy_threshold_accepts_all_matches() { + // Rule without entropy field — all matches accepted + let rules = RuleSet::from_toml( + r#" +[[rules]] +id = "no-threshold" +regex = '[a-zA-Z]{20,}' +"#, + ) + .expect("ruleset"); + + let input = "abcabcabcabcabcabcabcabc"; + let matches = rules.find_secrets(input); + assert_eq!( + matches.len(), + 1, + "without entropy threshold, all matches pass" + ); + } + + #[test] + fn bundled_rules_parse_entropy_field() { + let rules = RuleSet::bundled().expect("bundled rules"); + let with_entropy = rules + .rules + .iter() + .filter(|r| r.min_entropy.is_some()) + .count(); + assert!( + with_entropy >= 100, + "expected at least 100 rules with entropy thresholds, found {with_entropy}" + ); + } #[test] fn bundled_rules_load_without_skips() { let rules = RuleSet::bundled().expect("bundled rules"); diff --git a/src/launcher/bootstrap.rs b/src/launcher/bootstrap.rs index d99d27e..8bd858d 100644 --- a/src/launcher/bootstrap.rs +++ b/src/launcher/bootstrap.rs @@ -11,6 +11,7 @@ use tempfile::NamedTempFile; use crate::certgen::CaPair; use crate::config::Config; +use crate::entropy::EntropyConfig; use crate::errors::{KeyclawError, CODE_MITM_NOT_EFFECTIVE}; use crate::gitleaks_rules::RuleSet; use crate::pipeline::Processor; @@ -769,7 +770,12 @@ pub(super) fn build_processor(cfg: &Config) -> Result, KeyclawErr crate::vault::resolve_vault_passphrase(&cfg.vault_path, cfg.vault_passphrase.as_deref())?; let vault = Arc::new(Store::new(cfg.vault_path.clone(), passphrase)); - let ruleset = load_runtime_ruleset(cfg)?; + let mut ruleset = load_runtime_ruleset(cfg)?; + ruleset.entropy_config = EntropyConfig { + enabled: cfg.entropy_enabled, + threshold: cfg.entropy_threshold, + min_len: cfg.entropy_min_len, + }; crate::logging::info(&format!("{} gitleaks rules loaded", ruleset.rules.len())); @@ -1084,6 +1090,9 @@ mod tests { unsafe_log: false, require_mitm_effective: true, notice_mode: crate::redaction::NoticeMode::Verbose, + entropy_enabled: true, + entropy_threshold: 3.5, + entropy_min_len: 20, } } diff --git a/src/lib.rs b/src/lib.rs index ce1ddb8..090d147 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,5 +1,6 @@ pub mod certgen; pub mod config; +pub mod entropy; pub mod errors; pub mod gitleaks_rules; pub mod launcher; diff --git a/src/pipeline.rs b/src/pipeline.rs index 9ab1954..61fb582 100644 --- a/src/pipeline.rs +++ b/src/pipeline.rs @@ -239,6 +239,7 @@ mod tests { ruleset: Arc::new(RuleSet { rules: Vec::new(), skipped_rules: 0, + entropy_config: crate::entropy::EntropyConfig::default(), }), max_body_size, strict_mode: true, diff --git a/tests/e2e_cli.rs b/tests/e2e_cli.rs index e93eb76..77f8899 100644 --- a/tests/e2e_cli.rs +++ b/tests/e2e_cli.rs @@ -553,6 +553,7 @@ fn rewrite_json_respects_custom_gitleaks_config() { let mut child = rewrite_json_command(temp.path()) .env("KEYCLAW_GITLEAKS_CONFIG", &gitleaks_config) + .env("KEYCLAW_ENTROPY_ENABLED", "false") .stdin(Stdio::piped()) .stdout(Stdio::piped()) .spawn() @@ -608,7 +609,7 @@ fn rewrite_json_creates_machine_local_vault_key_without_env_override() { #[test] fn rewrite_json_preserves_env_style_assignment_boundaries() { let temp = tempfile::tempdir().expect("tempdir"); - let payload = r#"{"messages":[{"role":"user","content":"install K_API_KEY: 11111111-2222-3333-4444-555555555555 in .env\nthen set K_API_KEY = aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee"}]}"#; + let payload = r#"{"messages":[{"role":"user","content":"install K_API_KEY: f47ac10b-58cc-4372-a567-0e02b2c3d479 in .env\nthen set K_API_KEY = c9bf9e57-1685-4d46-a09f-3a1c5ee70b82"}]}"#; let mut child = rewrite_json_command(temp.path()) .stdin(Stdio::piped()) diff --git a/tests/pipeline.rs b/tests/pipeline.rs index db7cf82..e4810d5 100644 --- a/tests/pipeline.rs +++ b/tests/pipeline.rs @@ -230,3 +230,18 @@ fn resolve_text_reinjects_known_placeholders_even_with_example_notice_present() "resolved={resolved}" ); } + +#[test] +fn rewrite_detects_high_entropy_token_not_matched_by_regex() { + let processor = make_processor(false); + + // A custom internal token that has high entropy — entropy analysis should catch it + let body = br#"{"messages":[{"role":"user","content":"connect with token xK9mP2vL8nQ4wR6tY0uI3oA5sD7fG1hJ"}]}"#; + let result = processor.rewrite_and_evaluate(body).expect("rewrite"); + + let rewritten = String::from_utf8_lossy(&result.body); + assert!( + !rewritten.contains("xK9mP2vL8nQ4wR6tY0uI3oA5sD7fG1hJ"), + "high-entropy token should be redacted: {rewritten}" + ); +} diff --git a/tests/placeholder.rs b/tests/placeholder.rs index 5ac8b3a..9ab3e4f 100644 --- a/tests/placeholder.rs +++ b/tests/placeholder.rs @@ -58,8 +58,8 @@ fn gitleaks_rules_load_successfully() { fn generic_api_key_rule_preserves_assignment_boundaries() { let rules = bundled_rules(); let input = concat!( - "install K_API_KEY: 11111111-2222-3333-4444-555555555555 in .env\n", - "then set K_API_KEY = aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee\n", + "install K_API_KEY: f47ac10b-58cc-4372-a567-0e02b2c3d479 in .env\n", + "then set K_API_KEY = c9bf9e57-1685-4d46-a09f-3a1c5ee70b82\n", ); let (rewritten, replacements) = replace_secrets(input, &rules, |secret| {