66from typing import Iterable , Union
77
88import numpy as np
9+ import pandas as pd
910
1011from smftools .constants import BARCODE_KIT_ALIASES , LOAD_DIR , LOGGING_DIR , UMI_KIT_ALIASES
1112from smftools .logging_utils import get_logger , setup_logging
@@ -516,6 +517,8 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
516517 ########################################################################################################################
517518
518519 ################################### 4.5) Optional UMI annotation #############################################
520+ umi_sidecar = None
521+ barcode_sidecar = None
519522 if getattr (cfg , "use_umi" , False ):
520523 logger .info ("Annotating UMIs in aligned and sorted BAM before demultiplexing" )
521524
@@ -538,20 +541,20 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
538541 umi_kit_config = load_umi_config_from_yaml (umi_yaml_path )
539542 resolved_umi = resolve_umi_config (umi_kit_config , cfg )
540543
541- annotate_umi_tags_in_bam (
544+ umi_sidecar = annotate_umi_tags_in_bam (
542545 aligned_sorted_output ,
543546 use_umi = True ,
544- umi_adapters = getattr ( cfg , "umi_adapters" , None ) ,
547+ umi_kit_config = umi_kit_config ,
545548 umi_length = getattr (cfg , "umi_length" , None ),
546549 umi_search_window = getattr (cfg , "umi_search_window" , 200 ),
547- umi_adapter_matcher = getattr (cfg , "umi_adapter_matcher" , "exact " ),
550+ umi_adapter_matcher = getattr (cfg , "umi_adapter_matcher" , "edlib " ),
548551 umi_adapter_max_edits = resolved_umi ["umi_adapter_max_edits" ],
549552 samtools_backend = cfg .samtools_backend ,
550- umi_kit_config = umi_kit_config ,
551553 umi_ends = resolved_umi ["umi_ends" ],
552554 umi_flank_mode = resolved_umi ["umi_flank_mode" ],
553555 umi_amplicon_max_edits = resolved_umi ["umi_amplicon_max_edits" ],
554556 same_orientation = resolved_umi .get ("same_orientation" , False ),
557+ threads = cfg .threads ,
555558 )
556559 ########################################################################################################################
557560
@@ -603,7 +606,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
603606 resolved_bc = resolve_barcode_config (barcode_kit_config , cfg )
604607
605608 logger .info ("Extracting and assigning barcodes to aligned BAM using smftools backend" )
606- barcoded_bam = extract_and_assign_barcodes_in_bam (
609+ barcode_sidecar = extract_and_assign_barcodes_in_bam (
607610 aligned_sorted_output ,
608611 barcode_adapters = getattr (cfg , "barcode_adapters" , [None , None ]),
609612 barcode_references = barcode_references ,
@@ -619,10 +622,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
619622 barcode_kit_config = barcode_kit_config ,
620623 barcode_ends = resolved_bc ["barcode_ends" ],
621624 barcode_amplicon_gap_tolerance = resolved_bc ["barcode_amplicon_gap_tolerance" ],
625+ threads = cfg .threads ,
622626 )
623- # Update aligned_sorted_output to point to the barcoded BAM
624- aligned_sorted_output = barcoded_bam
625- logger .info (f"smftools barcode extraction complete: { barcoded_bam } " )
627+ logger .info (f"smftools barcode extraction complete: { barcode_sidecar } " )
626628 ########################################################################################################################
627629
628630 ################################### 5) Demultiplexing ######################################################################
@@ -646,6 +648,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
646648 cfg .split_path ,
647649 cfg .bam_suffix ,
648650 samtools_backend = cfg .samtools_backend ,
651+ barcode_sidecar = barcode_sidecar ,
649652 )
650653
651654 unclassified_bams = [p for p in all_bam_files if "unclassified" in p .name ]
@@ -716,7 +719,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
716719 # Annotate BM tag from bi per-end scores on each demuxed BAM
717720 for bam in bam_files :
718721 if "unclassified" not in bam .name :
719- annotate_demux_type_from_bi_tag (bam , threshold = 0.0 )
722+ annotate_demux_type_from_bi_tag (bam )
720723
721724 se_bam_files = bam_files
722725 bam_dir = cfg .split_path
@@ -956,14 +959,10 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
956959 default_tags = ["NM" , "MD" , "fn" ]
957960 if cfg .smf_modality == "direct" :
958961 default_tags .extend (["MM" , "ML" ])
959- # Add UMI tags if UMI extraction was enabled
960- if getattr (cfg , "use_umi" , False ):
961- default_tags .extend (["U1" , "U2" , "RX" ])
962- # Add barcode tags if smftools barcode extraction was used
963- if demux_backend == "smftools" and cfg .barcode_kit :
964- default_tags .extend (["BC" , "BM" , "B1" , "B2" , "B3" , "B4" , "B5" , "B6" ])
962+ # UMI tags are loaded from Parquet sidecar below (not from BAM)
963+ # Barcode tags are loaded from Parquet sidecar below (not from BAM)
965964 # Add barcode tags from dorado single-pass demux (BM annotated from bi tag)
966- elif demux_backend == "dorado" and cfg .barcode_kit and not cfg .input_already_demuxed :
965+ if demux_backend == "dorado" and cfg .barcode_kit and not cfg .input_already_demuxed :
967966 dorado_ver = _get_dorado_version ()
968967 if dorado_ver is not None and dorado_ver >= (1 , 3 , 1 ):
969968 default_tags .extend (["BC" , "BM" , "bi" ])
@@ -980,6 +979,31 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
980979 samtools_backend = cfg .samtools_backend ,
981980 )
982981
982+ # Load UMI tags from Parquet sidecar (written by annotate_umi_tags_in_bam)
983+ if getattr (cfg , "use_umi" , False ) and umi_sidecar and Path (umi_sidecar ).exists ():
984+ logger .info ("Loading UMI tags from Parquet sidecar: %s" , umi_sidecar )
985+ umi_df = pd .read_parquet (umi_sidecar ).set_index ("read_name" )
986+ umi_df = umi_df .reindex (raw_adata .obs_names )
987+ for col in ["U1" , "U2" , "RX" , "FC" , "US" , "UE" ]:
988+ if col in umi_df .columns :
989+ raw_adata .obs [col ] = umi_df [col ].values
990+ del umi_df
991+
992+ # Load barcode tags from Parquet sidecar (written by extract_and_assign_barcodes_in_bam)
993+ if (
994+ demux_backend == "smftools"
995+ and cfg .barcode_kit
996+ and barcode_sidecar
997+ and Path (barcode_sidecar ).exists ()
998+ ):
999+ logger .info ("Loading barcode tags from Parquet sidecar: %s" , barcode_sidecar )
1000+ bc_df = pd .read_parquet (barcode_sidecar ).set_index ("read_name" )
1001+ bc_df = bc_df .reindex (raw_adata .obs_names )
1002+ for col in ["BC" , "BM" , "B1" , "B2" , "B3" , "B4" , "B5" , "B6" ]:
1003+ if col in bc_df .columns :
1004+ raw_adata .obs [col ] = bc_df [col ].values
1005+ del bc_df
1006+
9831007 # Expand dorado bi array tag into individual float score columns
9841008 if "bi" in bam_tag_names :
9851009 expand_bi_tag_columns (raw_adata , bi_column = "bi" )
0 commit comments