Commit b93bcf3

feat(call): subset reads with long tracts for consensus

1 parent 1b02d12

1 file changed (+11, -9 lines)

strkit/call/call_locus.py

Lines changed: 11 additions & 9 deletions
@@ -80,8 +80,12 @@
 significant_clip_threshold = 100
 significant_clip_snv_take_in = 250

+# above large_consensus_length, the number of reads used for consensus is limited to max_n_large_consensus_reads
+large_consensus_length: int = 2000
+max_n_large_consensus_reads: int = 20
+
 # maximum median number of bases before we can't use POA for consensus anymore due to performance:
-max_mdn_poa_length = 5000
+max_mdn_poa_length: int = 5000


 # property getters & other partials
@@ -1563,14 +1567,12 @@ def get_read_length_partition_mean(p_idx: int) -> float:

     if call_data and consensus:
         def _consensi_for_key(k: Literal["_tr_seq", "_start_anchor"]):
-            return map(
-                lambda a: consensus_seq(
-                    list(map(lambda rr: read_dict_extra[rr][k], a)),
-                    logger_,
-                    max_mdn_poa_length,
-                ),
-                allele_reads,
-            )
+            for a in allele_reads:
+                seqs = list(map(lambda rr: read_dict_extra[rr][k], a))
+                if seqs and len(seqs[0]) > large_consensus_length:
+                    # if we're dealing with large sequences, use a subset of the reads to prevent stalling out.
+                    seqs = seqs[:max_n_large_consensus_reads]
+                yield consensus_seq(seqs, logger_, max_mdn_poa_length)

         call_seqs.extend(_consensi_for_key("_tr_seq"))
         call_anchor_seqs.extend(_consensi_for_key("_start_anchor"))
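
For context, below is a minimal, self-contained sketch of the capping behaviour this commit introduces: when tract sequences exceed a length threshold, only a fixed number of reads per allele is passed to consensus. The names fake_consensus and consensi, and the toy read data, are hypothetical stand-ins for strkit's consensus_seq and _consensi_for_key, used only to illustrate the logic; note that, as in the diff, only the first sequence's length is checked as a proxy for the tract length of the whole allele.

# Minimal sketch (not strkit code) of the read-subsetting introduced in this commit.
from typing import Iterable, Optional

# Values mirror the new module-level constants above.
LARGE_CONSENSUS_LENGTH = 2000
MAX_N_LARGE_CONSENSUS_READS = 20


def fake_consensus(seqs: list[str]) -> Optional[str]:
    # Stand-in for strkit's POA-based consensus_seq; just returns the first sequence here.
    return seqs[0] if seqs else None


def consensi(allele_reads: Iterable[list[str]]):
    for seqs in allele_reads:
        if seqs and len(seqs[0]) > LARGE_CONSENSUS_LENGTH:
            # Long tracts: POA over many long reads can stall, so cap the read count.
            seqs = seqs[:MAX_N_LARGE_CONSENSUS_READS]
        yield fake_consensus(seqs)


# Example: an allele with 30 long (>2000 bp) reads is capped at 20 reads before
# consensus, while an allele with short reads is left untouched.
long_allele = ["A" * 2500] * 30
short_allele = ["ACGT" * 10] * 30
for c in consensi([long_allele, short_allele]):
    print(len(c) if c else 0)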
