55# Python mirror of dada2_denoise.R. For each sample on a plate:
66# 1. Dereplicate forward and reverse filtered reads via dada2py
77# 2. Apply the DADA2 denoising algorithm using pre-learned error models
8- # 3. Merge denoised forward/reverse pairs (overlap alignment)
8+ # 3. Merge denoised forward/reverse pairs via C-level NW alignment
99# 4. Combine all merged samples into a sequence table (pandas DataFrame)
10- # 5. Per-plate chimera removal (abundance-ratio bimera check)
10+ # 5. Per-plate chimera removal via C-level consensus bimera detection
1111#
1212# Usage:
1313# dada2_denoise.py <plate_id> <errF.pkl> <errR.pkl> <min_overlap> <cpus>
8282
8383
8484# ---------------------------------------------------------------------------
85- # Merge pairs implementation
85+ # Merge pairs and chimera removal use dada2's C-level implementations
86+ # via the dada2py library (py/paired.py and py/chimera.py).
8687# ---------------------------------------------------------------------------
87- COMPLEMENT = str .maketrans ("ACGTacgt" , "TGCAtgca" )
88-
89-
90- def reverse_complement (seq ):
91- """Return the reverse complement of a DNA sequence."""
92- return seq .translate (COMPLEMENT )[::- 1 ]
93-
94-
95- def find_best_overlap (fwd_seq , rev_rc , min_ov ):
96- """Find the best overlap between the 3' end of fwd and 5' end of rev_rc.
97-
98- Slides rev_rc along fwd to find the position with the most matches.
99- Returns (overlap_length, n_mismatches, merged_seq) or None if no
100- valid overlap is found.
101- """
102- fwd_len = len (fwd_seq )
103- rev_len = len (rev_rc )
104- best_overlap = 0
105- best_mismatches = float ('inf' )
106- best_pos = - 1
107-
108- # The overlap region: the end of fwd overlaps with the start of rev_rc.
109- # overlap_len ranges from min_ov to min(fwd_len, rev_len).
110- max_overlap = min (fwd_len , rev_len )
111-
112- for ov in range (min_ov , max_overlap + 1 ):
113- fwd_region = fwd_seq [fwd_len - ov :]
114- rev_region = rev_rc [:ov ]
115-
116- mismatches = sum (1 for a , b in zip (fwd_region , rev_region ) if a != b )
117- match_ratio = 1.0 - mismatches / ov
118-
119- if match_ratio > 0.9 and (ov > best_overlap or
120- (ov == best_overlap and
121- mismatches < best_mismatches )):
122- best_overlap = ov
123- best_mismatches = mismatches
124- best_pos = ov
125-
126- if best_pos < 0 :
127- return None
128-
129- # Build merged sequence: fwd non-overlap + overlap (use fwd) + rev non-overlap
130- ov = best_overlap
131- merged = fwd_seq + rev_rc [ov :]
132- return (ov , best_mismatches , merged )
133-
134-
135- def merge_pairs (fwd_denoised , rev_denoised , min_ov = 10 ):
136- """Merge denoised forward and reverse reads by overlap alignment.
137-
138- Args:
139- fwd_denoised: dict {sequence: abundance} from dada() on forward reads
140- rev_denoised: dict {sequence: abundance} from dada() on reverse reads
141- min_ov: minimum overlap length
142-
143- Returns:
144- dict {merged_sequence: abundance}
145- """
146- merged = {}
147-
148- fwd_seqs = sorted (fwd_denoised .keys (),
149- key = lambda s : fwd_denoised [s ], reverse = True )
150- rev_seqs = sorted (rev_denoised .keys (),
151- key = lambda s : rev_denoised [s ], reverse = True )
152-
153- for fwd_seq in fwd_seqs :
154- fwd_abund = fwd_denoised [fwd_seq ]
155- for rev_seq in rev_seqs :
156- rev_abund = rev_denoised [rev_seq ]
157-
158- # Use the minimum abundance of the pair
159- pair_abund = min (fwd_abund , rev_abund )
160- if pair_abund <= 0 :
161- continue
162-
163- rev_rc = reverse_complement (rev_seq )
164- result = find_best_overlap (fwd_seq , rev_rc , min_ov )
165-
166- if result is not None :
167- ov , mm , merged_seq = result
168- merged [merged_seq ] = merged .get (merged_seq , 0 ) + pair_abund
169-
170- return merged
171-
172-
173- # ---------------------------------------------------------------------------
174- # Per-plate chimera removal (bimera detection)
175- #
176- # A simple abundance-ratio approach: for each ASV, check if it can be
177- # reconstructed as a chimera of two more-abundant parent ASVs. A sequence
178- # is flagged as a bimera if its left half closely matches one parent and
179- # its right half closely matches another.
180- # ---------------------------------------------------------------------------
181- def is_bimera (query , parents , min_fold = 1.5 , min_match_frac = 0.9 ):
182- """Check if query is a bimera (chimera of two parents).
183-
184- Splits the query at every possible breakpoint and checks if the left
185- half matches one parent and the right half matches another parent.
186- """
187- qlen = len (query )
188- if qlen < 20 :
189- return False
190-
191- for bp in range (10 , qlen - 10 ):
192- left = query [:bp ]
193- right = query [bp :]
194- left_len = len (left )
195- right_len = len (right )
196-
197- for i , p1 in enumerate (parents ):
198- if len (p1 ) < qlen :
199- continue
200- # Check left half against p1
201- p1_left = p1 [:left_len ]
202- left_matches = sum (1 for a , b in zip (left , p1_left ) if a == b )
203- if left_matches / left_len < min_match_frac :
204- continue
205-
206- for j , p2 in enumerate (parents ):
207- if i == j :
208- continue
209- if len (p2 ) < qlen :
210- continue
211- # Check right half against p2 (aligned from the right end)
212- p2_right = p2 [len (p2 ) - right_len :]
213- right_matches = sum (1 for a , b in zip (right , p2_right )
214- if a == b )
215- if right_matches / right_len >= min_match_frac :
216- return True
217-
218- return False
219-
220-
221- def remove_bimeras_from_table (seqtab_dict , min_fold = 1.5 ):
222- """Remove bimeric sequences from a sequence table.
223-
224- Args:
225- seqtab_dict: dict {sequence: total_abundance}
226-
227- Returns:
228- set of chimeric sequences
229- """
230- # Sort by abundance descending
231- sorted_seqs = sorted (seqtab_dict .keys (),
232- key = lambda s : seqtab_dict [s ], reverse = True )
233-
234- chimeras = set ()
235-
236- for idx , query in enumerate (sorted_seqs ):
237- query_abund = seqtab_dict [query ]
238- # Parents must be more abundant (by min_fold)
239- parents = [s for s in sorted_seqs [:idx ]
240- if seqtab_dict [s ] >= min_fold * query_abund ]
241-
242- if len (parents ) < 2 :
243- continue
244-
245- # Limit to top 20 parents to keep runtime reasonable
246- parents = parents [:20 ]
247-
248- if is_bimera (query , parents , min_fold = min_fold ):
249- chimeras .add (query )
250-
251- return chimeras
25288
25389
25490# ---------------------------------------------------------------------------
@@ -271,18 +107,24 @@ def remove_bimeras_from_table(seqtab_dict, min_fold=1.5):
271107 print (f"[WARNING] { sample_name } : dada() failed: { e } " )
272108 continue
273109
274- # Extract denoised sequences (may be missing if sample is too small)
275- fwd_denoised = ddF .get ("denoised" , {})
276- rev_denoised = ddR .get ("denoised" , {})
277-
278- if not fwd_denoised or not rev_denoised :
110+ # Check for denoised output
111+ if not ddF .get ("cluster_seqs" ) or not ddR .get ("cluster_seqs" ):
279112 print (f"[WARNING] { sample_name } : no denoised sequences, skipping" )
280113 continue
281114
282- merged = merge_pairs (fwd_denoised , rev_denoised , min_ov = min_overlap )
115+ # Merge pairs using C-level NW alignment (matches R's mergePairs)
116+ merged_list = dada2py .merge_pairs (ddF , derepF , ddR , derepR ,
117+ min_overlap = min_overlap ,
118+ trim_overhang = True , verbose = False )
119+
120+ # Collect accepted merged reads into {sequence: abundance}
121+ accepted = {}
122+ for m in merged_list :
123+ if m ["accept" ]:
124+ accepted [m ["sequence" ]] = accepted .get (m ["sequence" ], 0 ) + m ["abundance" ]
283125
284- if len ( merged ) > 0 :
285- all_merged [sample_name ] = merged
126+ if accepted :
127+ all_merged [sample_name ] = accepted
286128
287129# Drop samples that produced no merged reads
288130if len (all_merged ) == 0 :
@@ -311,15 +153,25 @@ def remove_bimeras_from_table(seqtab_dict, min_fold=1.5):
311153 f"{ n_asvs } unique ASVs, { total_reads } total reads" )
312154
313155# ---------------------------------------------------------------------------
314- # Per-plate chimera removal
156+ # Per-plate chimera removal using dada2's C-level consensus method
315157# ---------------------------------------------------------------------------
316- seq_totals = dt .groupby ("sequence" )["count" ].sum ().to_dict ()
158+ all_seqs = sorted (set (s for d in all_merged .values () for s in d ))
159+ sample_list = list (all_merged .keys ())
160+ seq_to_idx = {s : i for i , s in enumerate (all_seqs )}
161+
162+ mat = np .zeros ((len (sample_list ), len (all_seqs )), dtype = np .int32 )
163+ for si , sam in enumerate (sample_list ):
164+ for seq , ab in all_merged [sam ].items ():
165+ mat [si , seq_to_idx [seq ]] = ab
317166
318- chimeric_seqs = remove_bimeras_from_table (seq_totals )
167+ seqtab = {"table" : mat , "seqs" : all_seqs }
168+ chim_result = dada2py .remove_bimera_denovo (seqtab , verbose = True )
319169
170+ is_chimera = np .array (chim_result ["is_chimera" ])
171+ chimeric_seqs = set (s for s , c in zip (all_seqs , is_chimera ) if c )
320172dt_nochim = dt [~ dt ["sequence" ].isin (chimeric_seqs )].copy ()
321173
322- n_chimeras = len ( chimeric_seqs )
174+ n_chimeras = int ( is_chimera . sum () )
323175n_asvs_after = dt_nochim ["sequence" ].nunique ()
324176reads_after = dt_nochim ["count" ].sum ()
325177pct = round (reads_after / max (total_reads , 1 ) * 100 , 1 )
0 commit comments