From 1c14119511b9622ef3947043699277c806e718eb Mon Sep 17 00:00:00 2001 From: Alejandro Rivas Date: Sun, 5 Apr 2026 21:20:24 -0400 Subject: [PATCH] feat(semantic_dedup): make similarity threshold and shingle size configurable The SemanticDedup stage previously used hardcoded values for the Jaccard similarity threshold (0.8) and shingle size (3). Different content types benefit from different dedup aggressiveness -- e.g. log-heavy output works better with a looser threshold (0.6) while code blocks need stricter matching (0.9) to avoid false merges. This adds optional constructor parameters: - similarity_threshold: Jaccard cutoff (default 0.8, unchanged) - shingle_size: n-gram width for fingerprinting (default 3) - min_block_chars: minimum block length to consider (default 50) All parameters are threaded through _run_dedup and _split_blocks. Existing behavior is preserved when no arguments are passed. --- scripts/lib/fusion/semantic_dedup.py | 59 ++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/scripts/lib/fusion/semantic_dedup.py b/scripts/lib/fusion/semantic_dedup.py index 858093b..6a0a4a7 100644 --- a/scripts/lib/fusion/semantic_dedup.py +++ b/scripts/lib/fusion/semantic_dedup.py @@ -88,7 +88,7 @@ class _Block: _CODE_FENCE_RE = re.compile(r"```.*?```", re.DOTALL) -def _split_blocks(text: str) -> list[_Block]: +def _split_blocks(text: str, *, shingle_n: int = _SHINGLE_N) -> list[_Block]: """ Split *text* into logical blocks. @@ -108,7 +108,7 @@ def _in_fence(start: int, end: int) -> bool: # Add fenced code blocks as atomic blocks first. for fs, fe in fence_spans: block_text = text[fs:fe] - sh = _shingles(_tokenise(block_text)) + sh = _shingles(_tokenise(block_text), n=shingle_n) blocks.append(_Block( text=block_text, start=fs, @@ -149,7 +149,7 @@ def _in_fence(start: int, end: int) -> bool: if chunk.strip(): abs_start = seg_start + last abs_end = seg_start + m.start() - sh = _shingles(_tokenise(chunk)) + sh = _shingles(_tokenise(chunk), n=shingle_n) blocks.append(_Block( text=chunk, start=abs_start, @@ -163,7 +163,7 @@ def _in_fence(start: int, end: int) -> bool: if chunk.strip(): abs_start = seg_start + last abs_end = seg_start + len(segment) - sh = _shingles(_tokenise(chunk)) + sh = _shingles(_tokenise(chunk), n=shingle_n) blocks.append(_Block( text=chunk, start=abs_start, @@ -206,36 +206,42 @@ def as_dict(self) -> dict: } -def _run_dedup(text: str) -> tuple[str, DedupStats]: +def _run_dedup( + text: str, + *, + sim_threshold: float = _SIM_THRESHOLD, + shingle_n: int = _SHINGLE_N, + min_block_chars: int = _MIN_BLOCK_CHARS, +) -> tuple[str, DedupStats]: """ Run within-text block deduplication. + Args: + text: Source text. + sim_threshold: Jaccard similarity above which blocks are duplicates. + shingle_n: Word n-gram size for fingerprinting. + min_block_chars: Minimum block length to consider for dedup. + Returns the rewritten text and statistics. """ stats = DedupStats(tokens_before=estimate_tokens(text)) - blocks = _split_blocks(text) + blocks = _split_blocks(text, shingle_n=shingle_n) stats.blocks_total = len(blocks) if not blocks: stats.tokens_after = stats.tokens_before return text, stats - # Assign 1-based sequential numbers for use in references. - # We'll use the position in the sorted block list as the "block number". - # Blocks that are too short to consider receive no shingle set. - # First pass: mark duplicates. - # kept_blocks: list of (block_number, shingles) for blocks we are keeping. kept_blocks: list[tuple[int, frozenset]] = [] for idx, block in enumerate(blocks): block_num = idx + 1 # 1-based - short = len(block.text.strip()) < _MIN_BLOCK_CHARS + short = len(block.text.strip()) < min_block_chars no_shingles = len(block.shingles) < _MIN_SHINGLES if short or no_shingles: - # Too short / no shingles — always keep, never dedup. block.kept = True block.ref_to = None continue @@ -244,7 +250,7 @@ def _run_dedup(text: str) -> tuple[str, DedupStats]: duplicate_of: int | None = None for prev_num, prev_sh in kept_blocks: sim = _jaccard(block.shingles, prev_sh) - if sim >= _SIM_THRESHOLD: + if sim >= sim_threshold: duplicate_of = prev_num break @@ -305,19 +311,40 @@ class SemanticDedup(FusionStage): Splits text into blocks (paragraphs + fenced code blocks), fingerprints each with 3-word shingles, and replaces near-duplicate blocks - (Jaccard >= 0.8) with compact back-references. + (Jaccard >= threshold) with compact back-references. + + The similarity threshold and shingle size can be tuned via constructor + arguments to trade off between dedup aggressiveness and false positives. + Stricter thresholds (e.g. 0.9) only collapse near-identical blocks; + looser thresholds (e.g. 0.6) will catch paraphrased content at the risk + of merging blocks that differ in meaningful ways. """ name = "semantic_dedup" order = 12 # After Cortex(5), after any RLE-style stages(10), before Ionizer(15) + def __init__( + self, + similarity_threshold: float | None = None, + shingle_size: int | None = None, + min_block_chars: int | None = None, + ) -> None: + self.similarity_threshold = similarity_threshold or _SIM_THRESHOLD + self.shingle_size = shingle_size or _SHINGLE_N + self.min_block_chars = min_block_chars or _MIN_BLOCK_CHARS + def should_apply(self, ctx: FusionContext) -> bool: """Apply to any content longer than 200 characters.""" return len(ctx.content) > 200 def apply(self, ctx: FusionContext) -> FusionResult: original_tokens = estimate_tokens(ctx.content) - output, stats = _run_dedup(ctx.content) + output, stats = _run_dedup( + ctx.content, + sim_threshold=self.similarity_threshold, + shingle_n=self.shingle_size, + min_block_chars=self.min_block_chars, + ) compressed_tokens = estimate_tokens(output) markers: list[str] = []