Skip to content

Commit 8483432

Browse files
rec3141 authored and claude committed
Replace custom merge/chimera Python code with C-level dada2 implementations
dada2_denoise.py: remove 170 lines of custom merge_pairs() and is_bimera(), use dada2py.merge_pairs() (C-level NW alignment) and dada2py.remove_bimera_denovo() (C-level consensus chimera detection). remove_chimeras.py: remove 130 lines of custom sparse chimera detection, use dada2py.remove_bimera_denovo() with vectorized matrix construction. Validated at 98.3-100% Jaccard with R dada2 across 10-75 samples. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3d1ba74 commit 8483432

File tree

2 files changed

+57
-382
lines changed

2 files changed

+57
-382
lines changed

nextflow/bin/py/dada2_denoise.py

Lines changed: 32 additions & 180 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@
55
# Python mirror of dada2_denoise.R. For each sample on a plate:
66
# 1. Dereplicate forward and reverse filtered reads via dada2py
77
# 2. Apply the DADA2 denoising algorithm using pre-learned error models
8-
# 3. Merge denoised forward/reverse pairs (overlap alignment)
8+
# 3. Merge denoised forward/reverse pairs via C-level NW alignment
99
# 4. Combine all merged samples into a sequence table (pandas DataFrame)
10-
# 5. Per-plate chimera removal (abundance-ratio bimera check)
10+
# 5. Per-plate chimera removal via C-level consensus bimera detection
1111
#
1212
# Usage:
1313
# dada2_denoise.py <plate_id> <errF.pkl> <errR.pkl> <min_overlap> <cpus>
@@ -82,173 +82,9 @@
8282

8383

8484
# ---------------------------------------------------------------------------
85-
# Merge pairs implementation
85+
# Merge pairs and chimera removal use dada2's C-level implementations
86+
# via the dada2py library (py/paired.py and py/chimera.py).
8687
# ---------------------------------------------------------------------------
87-
COMPLEMENT = str.maketrans("ACGTacgt", "TGCAtgca")
88-
89-
90-
def reverse_complement(seq):
91-
"""Return the reverse complement of a DNA sequence."""
92-
return seq.translate(COMPLEMENT)[::-1]
93-
94-
95-
def find_best_overlap(fwd_seq, rev_rc, min_ov):
96-
"""Find the best overlap between the 3' end of fwd and 5' end of rev_rc.
97-
98-
Slides rev_rc along fwd to find the position with the most matches.
99-
Returns (overlap_length, n_mismatches, merged_seq) or None if no
100-
valid overlap is found.
101-
"""
102-
fwd_len = len(fwd_seq)
103-
rev_len = len(rev_rc)
104-
best_overlap = 0
105-
best_mismatches = float('inf')
106-
best_pos = -1
107-
108-
# The overlap region: the end of fwd overlaps with the start of rev_rc.
109-
# overlap_len ranges from min_ov to min(fwd_len, rev_len).
110-
max_overlap = min(fwd_len, rev_len)
111-
112-
for ov in range(min_ov, max_overlap + 1):
113-
fwd_region = fwd_seq[fwd_len - ov:]
114-
rev_region = rev_rc[:ov]
115-
116-
mismatches = sum(1 for a, b in zip(fwd_region, rev_region) if a != b)
117-
match_ratio = 1.0 - mismatches / ov
118-
119-
if match_ratio > 0.9 and (ov > best_overlap or
120-
(ov == best_overlap and
121-
mismatches < best_mismatches)):
122-
best_overlap = ov
123-
best_mismatches = mismatches
124-
best_pos = ov
125-
126-
if best_pos < 0:
127-
return None
128-
129-
# Build merged sequence: fwd non-overlap + overlap (use fwd) + rev non-overlap
130-
ov = best_overlap
131-
merged = fwd_seq + rev_rc[ov:]
132-
return (ov, best_mismatches, merged)
133-
134-
135-
def merge_pairs(fwd_denoised, rev_denoised, min_ov=10):
136-
"""Merge denoised forward and reverse reads by overlap alignment.
137-
138-
Args:
139-
fwd_denoised: dict {sequence: abundance} from dada() on forward reads
140-
rev_denoised: dict {sequence: abundance} from dada() on reverse reads
141-
min_ov: minimum overlap length
142-
143-
Returns:
144-
dict {merged_sequence: abundance}
145-
"""
146-
merged = {}
147-
148-
fwd_seqs = sorted(fwd_denoised.keys(),
149-
key=lambda s: fwd_denoised[s], reverse=True)
150-
rev_seqs = sorted(rev_denoised.keys(),
151-
key=lambda s: rev_denoised[s], reverse=True)
152-
153-
for fwd_seq in fwd_seqs:
154-
fwd_abund = fwd_denoised[fwd_seq]
155-
for rev_seq in rev_seqs:
156-
rev_abund = rev_denoised[rev_seq]
157-
158-
# Use the minimum abundance of the pair
159-
pair_abund = min(fwd_abund, rev_abund)
160-
if pair_abund <= 0:
161-
continue
162-
163-
rev_rc = reverse_complement(rev_seq)
164-
result = find_best_overlap(fwd_seq, rev_rc, min_ov)
165-
166-
if result is not None:
167-
ov, mm, merged_seq = result
168-
merged[merged_seq] = merged.get(merged_seq, 0) + pair_abund
169-
170-
return merged
171-
172-
173-
# ---------------------------------------------------------------------------
174-
# Per-plate chimera removal (bimera detection)
175-
#
176-
# A simple abundance-ratio approach: for each ASV, check if it can be
177-
# reconstructed as a chimera of two more-abundant parent ASVs. A sequence
178-
# is flagged as a bimera if its left half closely matches one parent and
179-
# its right half closely matches another.
180-
# ---------------------------------------------------------------------------
181-
def is_bimera(query, parents, min_fold=1.5, min_match_frac=0.9):
182-
"""Check if query is a bimera (chimera of two parents).
183-
184-
Splits the query at every possible breakpoint and checks if the left
185-
half matches one parent and the right half matches another parent.
186-
"""
187-
qlen = len(query)
188-
if qlen < 20:
189-
return False
190-
191-
for bp in range(10, qlen - 10):
192-
left = query[:bp]
193-
right = query[bp:]
194-
left_len = len(left)
195-
right_len = len(right)
196-
197-
for i, p1 in enumerate(parents):
198-
if len(p1) < qlen:
199-
continue
200-
# Check left half against p1
201-
p1_left = p1[:left_len]
202-
left_matches = sum(1 for a, b in zip(left, p1_left) if a == b)
203-
if left_matches / left_len < min_match_frac:
204-
continue
205-
206-
for j, p2 in enumerate(parents):
207-
if i == j:
208-
continue
209-
if len(p2) < qlen:
210-
continue
211-
# Check right half against p2 (aligned from the right end)
212-
p2_right = p2[len(p2) - right_len:]
213-
right_matches = sum(1 for a, b in zip(right, p2_right)
214-
if a == b)
215-
if right_matches / right_len >= min_match_frac:
216-
return True
217-
218-
return False
219-
220-
221-
def remove_bimeras_from_table(seqtab_dict, min_fold=1.5):
222-
"""Remove bimeric sequences from a sequence table.
223-
224-
Args:
225-
seqtab_dict: dict {sequence: total_abundance}
226-
227-
Returns:
228-
set of chimeric sequences
229-
"""
230-
# Sort by abundance descending
231-
sorted_seqs = sorted(seqtab_dict.keys(),
232-
key=lambda s: seqtab_dict[s], reverse=True)
233-
234-
chimeras = set()
235-
236-
for idx, query in enumerate(sorted_seqs):
237-
query_abund = seqtab_dict[query]
238-
# Parents must be more abundant (by min_fold)
239-
parents = [s for s in sorted_seqs[:idx]
240-
if seqtab_dict[s] >= min_fold * query_abund]
241-
242-
if len(parents) < 2:
243-
continue
244-
245-
# Limit to top 20 parents to keep runtime reasonable
246-
parents = parents[:20]
247-
248-
if is_bimera(query, parents, min_fold=min_fold):
249-
chimeras.add(query)
250-
251-
return chimeras
25288

25389

25490
# ---------------------------------------------------------------------------
@@ -271,18 +107,24 @@ def remove_bimeras_from_table(seqtab_dict, min_fold=1.5):
271107
print(f"[WARNING] {sample_name}: dada() failed: {e}")
272108
continue
273109

274-
# Extract denoised sequences (may be missing if sample is too small)
275-
fwd_denoised = ddF.get("denoised", {})
276-
rev_denoised = ddR.get("denoised", {})
277-
278-
if not fwd_denoised or not rev_denoised:
110+
# Check for denoised output
111+
if not ddF.get("cluster_seqs") or not ddR.get("cluster_seqs"):
279112
print(f"[WARNING] {sample_name}: no denoised sequences, skipping")
280113
continue
281114

282-
merged = merge_pairs(fwd_denoised, rev_denoised, min_ov=min_overlap)
115+
# Merge pairs using C-level NW alignment (matches R's mergePairs)
116+
merged_list = dada2py.merge_pairs(ddF, derepF, ddR, derepR,
117+
min_overlap=min_overlap,
118+
trim_overhang=True, verbose=False)
119+
120+
# Collect accepted merged reads into {sequence: abundance}
121+
accepted = {}
122+
for m in merged_list:
123+
if m["accept"]:
124+
accepted[m["sequence"]] = accepted.get(m["sequence"], 0) + m["abundance"]
283125

284-
if len(merged) > 0:
285-
all_merged[sample_name] = merged
126+
if accepted:
127+
all_merged[sample_name] = accepted
286128

287129
# Drop samples that produced no merged reads
288130
if len(all_merged) == 0:
@@ -311,15 +153,25 @@ def remove_bimeras_from_table(seqtab_dict, min_fold=1.5):
311153
f"{n_asvs} unique ASVs, {total_reads} total reads")
312154

313155
# ---------------------------------------------------------------------------
314-
# Per-plate chimera removal
156+
# Per-plate chimera removal using dada2's C-level consensus method
315157
# ---------------------------------------------------------------------------
316-
seq_totals = dt.groupby("sequence")["count"].sum().to_dict()
158+
all_seqs = sorted(set(s for d in all_merged.values() for s in d))
159+
sample_list = list(all_merged.keys())
160+
seq_to_idx = {s: i for i, s in enumerate(all_seqs)}
161+
162+
mat = np.zeros((len(sample_list), len(all_seqs)), dtype=np.int32)
163+
for si, sam in enumerate(sample_list):
164+
for seq, ab in all_merged[sam].items():
165+
mat[si, seq_to_idx[seq]] = ab
317166

318-
chimeric_seqs = remove_bimeras_from_table(seq_totals)
167+
seqtab = {"table": mat, "seqs": all_seqs}
168+
chim_result = dada2py.remove_bimera_denovo(seqtab, verbose=True)
319169

170+
is_chimera = np.array(chim_result["is_chimera"])
171+
chimeric_seqs = set(s for s, c in zip(all_seqs, is_chimera) if c)
320172
dt_nochim = dt[~dt["sequence"].isin(chimeric_seqs)].copy()
321173

322-
n_chimeras = len(chimeric_seqs)
174+
n_chimeras = int(is_chimera.sum())
323175
n_asvs_after = dt_nochim["sequence"].nunique()
324176
reads_after = dt_nochim["count"].sum()
325177
pct = round(reads_after / max(total_reads, 1) * 100, 1)

0 commit comments

Comments (0)