Skip to content

Commit d1ff14c

Browse files
authored
Merge pull request #362 from jkmckenna/0.4.1
0.4.1
2 parents 7142e33 + 5fb7d28 commit d1ff14c

8 files changed

Lines changed: 1049 additions & 1057 deletions

File tree

docs/source/tutorials/cli_usage.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ The load command builds the raw AnnData object from your raw sequencing data. It
2929
- adata.X contains binarized modification data (conversion/deaminase), or modification probabilitiesc (native).
3030
- Adds basic read-level QC annotations (Read start, end, length, mean quality).
3131
- Adds layers encoding read DNA sequences, base quality scores, base mismatches.
32-
- Maintains BAM Tags/Flags in adata.obs.
32+
- Maintains BAM tags/flags in adata.obs (UMI and barcode annotations loaded from Parquet sidecars).
3333
- Writes the raw AnnData to the canonical output path and runs MultiQC.
3434
- Optionally deletes intermediate BAMs, H5ADs, and TSVs.
3535

docs/source/tutorials/experiment_config.md

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -57,18 +57,25 @@ Below are some of the most commonly edited fields and how they affect the CLI wo
5757
- Lists are written in bracketed form, e.g. `[5mC]` or `[5mC_5hmC]`.
5858
- If you update the CSV, re-run the CLI command pointing at the updated file.
5959

60-
## BAM tags
60+
## Read annotations
6161

62-
smftools writes and/or propagates the following BAM tags when loading data. These are also loaded
63-
into `adata.obs` when `load_adata` reads BAM tags.
62+
smftools annotates reads during `load_adata` and stores the results in `adata.obs`. Standard BAM
63+
tags (e.g. `NM`, `MD`, `MM`, `ML`) are read directly from BAM files. UMI and barcode annotations
64+
are computed in parallel and written to Parquet sidecar files alongside the aligned BAM, then loaded
65+
into `adata.obs` from those sidecars. The aligned BAM itself is not modified.
6466

65-
**UMI tags**
67+
**UMI annotations** (written to `.umi_tags.parquet`)
6668

67-
- `U1`: UMI from the *top* flank (read start or read end depending on match).
68-
- `U2`: UMI from the *bottom* flank.
69+
- `U1`: Orientation-corrected UMI for the *left* reference end of the mapped fragment (forward reads: US, reverse reads: UE).
70+
- `U2`: Orientation-corrected UMI for the *right* reference end of the mapped fragment (forward reads: UE, reverse reads: US).
71+
- `US`: Positional UMI from read start (delimited `UMI_seq;slot;flank_seq`).
72+
- `UE`: Positional UMI from read end (delimited `UMI_seq;slot;flank_seq`).
6973
- `RX`: Combined UMI string (`U1-U2`, or `U1`/`U2` if only one is present).
74+
- `FC`: Flank context of the U1/U2 pair (e.g. `top-bottom`).
7075

71-
**Barcode tags (smftools demux backend)**
76+
When `threads` is set, UMI extraction is parallelized across multiple CPU cores.
77+
78+
**Barcode annotations (smftools demux backend)** (written to `.barcode_tags.parquet`)
7279

7380
- `BC`: Assigned barcode name, or `unclassified`.
7481
- `BM`: Match type (`both`, `read_start_only`, `read_end_only`, `mismatch`, `unclassified`).
@@ -79,9 +86,13 @@ into `adata.obs` when `load_adata` reads BAM tags.
7986
- `B5`: Barcode name matched at the read start (corresponds to `B1`/`B3`).
8087
- `B6`: Barcode name matched at the read end (corresponds to `B2`/`B4`).
8188

82-
**Barcode tags (dorado demux backend)**
89+
When `threads` is set, barcode extraction is parallelized across multiple CPU cores.
90+
Demultiplexing (splitting reads into per-barcode BAMs) uses the sidecar `BC` assignments.
91+
Only primary alignments are included in split BAMs and sidecar files.
92+
93+
**Barcode annotations (dorado demux backend)**
8394

84-
- `BC`: Assigned barcode name.
95+
- `BC`: Assigned barcode name (read from BAM tag).
8596
- `bi`: Dorado barcode info array (if present; expanded into columns during load).
8697

8798
Notes:

src/smftools/cli/load_adata.py

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from typing import Iterable, Union
77

88
import numpy as np
9+
import pandas as pd
910

1011
from smftools.constants import BARCODE_KIT_ALIASES, LOAD_DIR, LOGGING_DIR, UMI_KIT_ALIASES
1112
from smftools.logging_utils import get_logger, setup_logging
@@ -516,6 +517,8 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
516517
########################################################################################################################
517518

518519
################################### 4.5) Optional UMI annotation #############################################
520+
umi_sidecar = None
521+
barcode_sidecar = None
519522
if getattr(cfg, "use_umi", False):
520523
logger.info("Annotating UMIs in aligned and sorted BAM before demultiplexing")
521524

@@ -538,20 +541,20 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
538541
umi_kit_config = load_umi_config_from_yaml(umi_yaml_path)
539542
resolved_umi = resolve_umi_config(umi_kit_config, cfg)
540543

541-
annotate_umi_tags_in_bam(
544+
umi_sidecar = annotate_umi_tags_in_bam(
542545
aligned_sorted_output,
543546
use_umi=True,
544-
umi_adapters=getattr(cfg, "umi_adapters", None),
547+
umi_kit_config=umi_kit_config,
545548
umi_length=getattr(cfg, "umi_length", None),
546549
umi_search_window=getattr(cfg, "umi_search_window", 200),
547-
umi_adapter_matcher=getattr(cfg, "umi_adapter_matcher", "exact"),
550+
umi_adapter_matcher=getattr(cfg, "umi_adapter_matcher", "edlib"),
548551
umi_adapter_max_edits=resolved_umi["umi_adapter_max_edits"],
549552
samtools_backend=cfg.samtools_backend,
550-
umi_kit_config=umi_kit_config,
551553
umi_ends=resolved_umi["umi_ends"],
552554
umi_flank_mode=resolved_umi["umi_flank_mode"],
553555
umi_amplicon_max_edits=resolved_umi["umi_amplicon_max_edits"],
554556
same_orientation=resolved_umi.get("same_orientation", False),
557+
threads=cfg.threads,
555558
)
556559
########################################################################################################################
557560

@@ -603,7 +606,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
603606
resolved_bc = resolve_barcode_config(barcode_kit_config, cfg)
604607

605608
logger.info("Extracting and assigning barcodes to aligned BAM using smftools backend")
606-
barcoded_bam = extract_and_assign_barcodes_in_bam(
609+
barcode_sidecar = extract_and_assign_barcodes_in_bam(
607610
aligned_sorted_output,
608611
barcode_adapters=getattr(cfg, "barcode_adapters", [None, None]),
609612
barcode_references=barcode_references,
@@ -619,10 +622,9 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
619622
barcode_kit_config=barcode_kit_config,
620623
barcode_ends=resolved_bc["barcode_ends"],
621624
barcode_amplicon_gap_tolerance=resolved_bc["barcode_amplicon_gap_tolerance"],
625+
threads=cfg.threads,
622626
)
623-
# Update aligned_sorted_output to point to the barcoded BAM
624-
aligned_sorted_output = barcoded_bam
625-
logger.info(f"smftools barcode extraction complete: {barcoded_bam}")
627+
logger.info(f"smftools barcode extraction complete: {barcode_sidecar}")
626628
########################################################################################################################
627629

628630
################################### 5) Demultiplexing ######################################################################
@@ -646,6 +648,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
646648
cfg.split_path,
647649
cfg.bam_suffix,
648650
samtools_backend=cfg.samtools_backend,
651+
barcode_sidecar=barcode_sidecar,
649652
)
650653

651654
unclassified_bams = [p for p in all_bam_files if "unclassified" in p.name]
@@ -716,7 +719,7 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
716719
# Annotate BM tag from bi per-end scores on each demuxed BAM
717720
for bam in bam_files:
718721
if "unclassified" not in bam.name:
719-
annotate_demux_type_from_bi_tag(bam, threshold=0.0)
722+
annotate_demux_type_from_bi_tag(bam)
720723

721724
se_bam_files = bam_files
722725
bam_dir = cfg.split_path
@@ -956,14 +959,10 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
956959
default_tags = ["NM", "MD", "fn"]
957960
if cfg.smf_modality == "direct":
958961
default_tags.extend(["MM", "ML"])
959-
# Add UMI tags if UMI extraction was enabled
960-
if getattr(cfg, "use_umi", False):
961-
default_tags.extend(["U1", "U2", "RX"])
962-
# Add barcode tags if smftools barcode extraction was used
963-
if demux_backend == "smftools" and cfg.barcode_kit:
964-
default_tags.extend(["BC", "BM", "B1", "B2", "B3", "B4", "B5", "B6"])
962+
# UMI tags are loaded from Parquet sidecar below (not from BAM)
963+
# Barcode tags are loaded from Parquet sidecar below (not from BAM)
965964
# Add barcode tags from dorado single-pass demux (BM annotated from bi tag)
966-
elif demux_backend == "dorado" and cfg.barcode_kit and not cfg.input_already_demuxed:
965+
if demux_backend == "dorado" and cfg.barcode_kit and not cfg.input_already_demuxed:
967966
dorado_ver = _get_dorado_version()
968967
if dorado_ver is not None and dorado_ver >= (1, 3, 1):
969968
default_tags.extend(["BC", "BM", "bi"])
@@ -980,6 +979,31 @@ def load_adata_core(cfg, paths: AdataPaths, config_path: str | None = None):
980979
samtools_backend=cfg.samtools_backend,
981980
)
982981

982+
# Load UMI tags from Parquet sidecar (written by annotate_umi_tags_in_bam)
983+
if getattr(cfg, "use_umi", False) and umi_sidecar and Path(umi_sidecar).exists():
984+
logger.info("Loading UMI tags from Parquet sidecar: %s", umi_sidecar)
985+
umi_df = pd.read_parquet(umi_sidecar).set_index("read_name")
986+
umi_df = umi_df.reindex(raw_adata.obs_names)
987+
for col in ["U1", "U2", "RX", "FC", "US", "UE"]:
988+
if col in umi_df.columns:
989+
raw_adata.obs[col] = umi_df[col].values
990+
del umi_df
991+
992+
# Load barcode tags from Parquet sidecar (written by extract_and_assign_barcodes_in_bam)
993+
if (
994+
demux_backend == "smftools"
995+
and cfg.barcode_kit
996+
and barcode_sidecar
997+
and Path(barcode_sidecar).exists()
998+
):
999+
logger.info("Loading barcode tags from Parquet sidecar: %s", barcode_sidecar)
1000+
bc_df = pd.read_parquet(barcode_sidecar).set_index("read_name")
1001+
bc_df = bc_df.reindex(raw_adata.obs_names)
1002+
for col in ["BC", "BM", "B1", "B2", "B3", "B4", "B5", "B6"]:
1003+
if col in bc_df.columns:
1004+
raw_adata.obs[col] = bc_df[col].values
1005+
del bc_df
1006+
9831007
# Expand dorado bi array tag into individual float score columns
9841008
if "bi" in bam_tag_names:
9851009
expand_bi_tag_columns(raw_adata, bi_column="bi")

src/smftools/config/default.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,6 @@ demux_backend: dorado # smftools|dorado - smftools uses YAML-based barcode refs,
8989
barcode_both_ends: False # Require barcode match at both ends
9090
trim: False # dorado adapter and barcode removal during demultiplexing
9191
use_umi: False # Whether to detect and annotate UMIs in aligned_sorted BAM before demultiplexing
92-
umi_adapters: [null, null] # Two-slot list [left_ref_end_adapter, right_ref_end_adapter]
9392
umi_length: null # Length of each UMI (required when use_umi is true)
9493
umi_search_window: 200 # Max distance from read ends to consider an adapter match for UMI extraction
9594
umi_adapter_matcher: edlib # exact|edlib

0 commit comments

Comments
 (0)