Skip to content

Commit 4d6e755

Browse files
committed
feat: add IteratorColumnRecords for building pileups from AlignedSegments
Add new IteratorColumnRecords class that generates pileup columns from a collection of AlignedSegment objects using htslib's push-based pileup API (bam_plp_push/bam_plp64_next). Key features: - Accepts any iterable of AlignedSegments (requires coordinate-sorted order) - Supports optional reference sequence (fastafile parameter) - Includes add_reference(), has_reference(), and seq_len property - Configurable min_base_quality parameter - Uses 64-bit position types (hts_pos_t) for extended chromosome support Implementation notes: - Uses bam_plp_push/bam_plp64_next instead of callback-based approach - Records consumed during initialization for push-based API - Includes required NULL push to signal end-of-input - Leverages 64-bit APIs (bam_plp64_next, faidx_fetch_seq64) from PR #1362 - Uses opaque bam_plp_s struct (no direct field access needed) Testing: - 12 new tests covering reference support, edge cases, and parameters - Documented known limitation: minor depth differences vs samtools mpileup due to push-based vs pull-based filtering differences Changes: - Add IteratorColumnRecords class in pysam/libcalignmentfile.pyx - Update to use 64-bit pileup APIs (bam_plp64_next, faidx_fetch_seq64) - Add type stub in pysam/libcalignmentfile.pyi - Add parameterized to test dependencies for parameterized tests - Update CI workflows to install parameterized package Closes #1352
1 parent d6fbe29 commit 4d6e755

File tree

7 files changed

+439
-11
lines changed

7 files changed

+439
-11
lines changed

.github/workflows/ci.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ jobs:
2828
python-version: ${{ matrix.python-version }}
2929

3030
- name: Install prerequisite Python libraries
31-
run: pip install cython mypy pytest setuptools
31+
run: pip install cython mypy pytest setuptools parameterized
3232

3333
- name: Install Linux build prerequisites
3434
if: runner.os == 'Linux'
@@ -79,7 +79,7 @@ jobs:
7979
python-version: ${{ matrix.python-version }}
8080

8181
- name: Install prerequisite Python libraries
82-
run: pip install cython pytest
82+
run: pip install cython pytest parameterized
8383

8484
- name: Install build prerequisites
8585
if: runner.os == 'Linux'
@@ -147,7 +147,7 @@ jobs:
147147
run: python setup.py install
148148

149149
- name: Install test prerequisites via Conda
150-
run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest
150+
run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest parameterized
151151

152152
- name: Run tests
153153
run: REF_PATH=':' pytest

pysam/libcalignmentfile.pxd

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ cdef class IteratorColumn:
130130
# backwards compatibility
131131
cdef char * getSequence(self)
132132

133-
134133
cdef class IteratorColumnRegion(IteratorColumn):
135134
cdef int start
136135
cdef int stop
@@ -144,6 +143,20 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
144143
cdef class IteratorColumnAll(IteratorColumn):
145144
pass
146145

146+
cdef class IteratorColumnRecords:
147+
cdef int cnext(self)
148+
cdef bam_plp_t plp_iter
149+
cdef int tid
150+
cdef hts_pos_t pos
151+
cdef int n_plp
152+
cdef uint32_t min_base_quality
153+
cdef const bam_pileup1_t * plp
154+
cdef AlignmentHeader header
155+
cdef char * seq
156+
cdef hts_pos_t seq_len
157+
cdef faidx_t * fastafile
158+
cdef char * get_sequence(self)
159+
147160

148161
cdef class IndexedReads:
149162
cdef AlignmentFile samfile

pysam/libcalignmentfile.pyi

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,13 @@ class IteratorColumn:
210210
class IteratorColumnAll(IteratorColumn): ...
211211
class IteratorColumnAllRefs(IteratorColumn): ...
212212
class IteratorColumnRegion(IteratorColumn): ...
213+
class IteratorColumnRecords():
214+
def __iter__(self) -> IteratorColumn: ...
215+
def __next__(self) -> PileupColumn: ...
216+
@property
217+
def seq_len(self) -> int: ...
218+
def add_reference(self, fastafile: FastaFile) -> None: ...
219+
def has_reference(self) -> bool: ...
213220

214221
class SNPCall:
215222
@property

pysam/libcalignmentfile.pyx

Lines changed: 161 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
# class IteratorColumnRegion
3030
# class IteratorColumnAll
3131
# class IteratorColumnAllRefs
32+
# class IteratorColumnRecords
3233
#
3334
########################################################
3435
#
@@ -57,6 +58,8 @@
5758
########################################################
5859
import os
5960
import collections
61+
from typing import Iterable, Optional
62+
6063
try:
6164
from collections.abc import Sequence, Mapping # noqa
6265
except ImportError:
@@ -74,7 +77,7 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
7477
from pysam.libcutils cimport OSError_from_errno, encode_filename, from_string_and_size
7578
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
7679
from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
77-
HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
80+
HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL, hts_pos_t
7881

7982
from io import StringIO
8083

@@ -86,6 +89,7 @@ __all__ = [
8689
"AlignmentHeader",
8790
"IteratorRow",
8891
"IteratorColumn",
92+
"IteratorColumnRecords",
8993
"IndexedReads"]
9094

9195
IndexStats = collections.namedtuple("IndexStats",
@@ -2783,6 +2787,162 @@ cdef class IteratorColumnAll(IteratorColumn):
27832787
self.samfile.header)
27842788

27852789

2790+
2791+
cdef class IteratorColumnRecords:
2792+
'''Iterator over columns when given a collection of :class:`~pysam.AlignedSegment`s.
2793+
2794+
For reasons of efficiency, the iterator requires the given
2795+
:class:`~pysam.AlignedSegment`s to be in coordinate sorted order.
2796+
For implementation simplicity, all the records will be consumed
2797+
from the given iterator.
2798+
2799+
For example:
2800+
2801+
f = AlignmentFile("file.bam", "rb")
2802+
result = list(IteratorColumnRecords([rec for rec in f]))
2803+
2804+
Here, `result`` will be a list of ``n`` lists of objects of type
2805+
:class:`~pysam.PileupRead`.
2806+
2807+
If the iterator is associated with a :class:`~pysam.Fastafile`
2808+
using the :meth:`add_reference` method, then the iterator will
2809+
export the current sequence via the methods :meth:`get_sequence`
2810+
and :meth:`seq_len`.
2811+
2812+
See :class:`~AlignmentFile.pileup` for kwargs to the iterator.
2813+
2814+
.. note::
2815+
2816+
**Filtering Behavior:** This iterator uses a push-based approach where
2817+
records are added via ``bam_plp_push()``. This differs from the standard
2818+
:meth:`AlignmentFile.pileup` which uses a pull-based callback approach
2819+
with htslib's internal filtering pipeline.
2820+
2821+
If you manually filter records before passing them to this iterator
2822+
(e.g., filtering by flags, mapping quality, etc.), the resulting pileup
2823+
may differ slightly from equivalent ``samtools mpileup`` output, even
2824+
with identical filtering criteria. This is because the standard pileup
2825+
pipeline performs additional processing during iteration, including:
2826+
2827+
- Base Alignment Quality (BAQ) computation
2828+
- Mapping quality adjustments
2829+
- Internal overlap handling
2830+
- Filter application timing differences
2831+
2832+
For exact ``samtools mpileup`` compatibility, use
2833+
:meth:`AlignmentFile.pileup` with the ``stepper="samtools"`` option
2834+
instead of manually filtering and using this iterator.
2835+
2836+
This iterator is best suited for cases where you need to:
2837+
2838+
- Build pileups from records already in memory
2839+
- Apply custom filtering logic not available in standard pileup
2840+
- Process records from multiple sources before pileup generation
2841+
2842+
'''
2843+
2844+
def __cinit__(self, recs: Iterable[AlignedSegment], **kwargs):
2845+
cdef FastaFile fastafile = kwargs.get("fastafile", None)
2846+
if fastafile is None:
2847+
self.fastafile = NULL
2848+
else:
2849+
self.fastafile = fastafile.fastafile
2850+
self.min_base_quality = kwargs.get("min_base_quality", 13)
2851+
self.plp_iter = <bam_plp_t>bam_plp_init(NULL, NULL)
2852+
rec: AlignedSegment
2853+
self.header: Optional[AlignmentHeader] = None
2854+
for rec in recs:
2855+
if self.header is None:
2856+
self.header = rec.header
2857+
if bam_plp_push(self.plp_iter, rec._delegate) != 0:
2858+
raise Exception("Could not add record to the iterator: {}".format(str(rec)))
2859+
# Signal end of input
2860+
if bam_plp_push(self.plp_iter, NULL) != 0:
2861+
raise Exception("Could not finalize the iterator")
2862+
2863+
def __dealloc__(self):
2864+
bam_plp_destroy(self.plp_iter)
2865+
self.plp_iter = <bam_plp_t>NULL
2866+
if self.seq != NULL:
2867+
free(self.seq)
2868+
self.seq = NULL
2869+
2870+
def __iter__(self):
2871+
return self
2872+
2873+
cdef int cnext(self):
2874+
'''perform next iteration.
2875+
'''
2876+
self.plp = <bam_pileup1_t*>bam_plp64_next(
2877+
self.plp_iter,
2878+
&self.tid,
2879+
&self.pos,
2880+
&self.n_plp
2881+
)
2882+
if self.plp == NULL:
2883+
return 0
2884+
else:
2885+
return 1
2886+
2887+
def __next__(self):
2888+
cdef int n
2889+
cdef int tid
2890+
n = self.cnext()
2891+
if n == 0:
2892+
raise StopIteration
2893+
2894+
# reload sequence
2895+
cdef bam1_t *b = self.plp[0].b
2896+
if self.fastafile != NULL and self.tid != b.core.tid:
2897+
if self.seq != NULL:
2898+
free(self.seq)
2899+
self.tid = b.core.tid
2900+
tid = self.tid
2901+
assert self.header is not None
2902+
with nogil:
2903+
self.seq = faidx_fetch_seq64(
2904+
self.fastafile,
2905+
self.header.ptr.target_name[tid],
2906+
0, MAX_POS,
2907+
&self.seq_len)
2908+
2909+
if self.seq == NULL:
2910+
raise ValueError(
2911+
"reference sequence for '{}' (tid={}) not found".format(
2912+
self.header.target_name[self.tid], self.tid))
2913+
2914+
return makePileupColumn(&self.plp,
2915+
self.tid,
2916+
self.pos,
2917+
self.n_plp,
2918+
self.min_base_quality,
2919+
self.seq,
2920+
self.header)
2921+
2922+
cdef char * get_sequence(self):
2923+
'''return current reference sequence underlying the iterator.
2924+
'''
2925+
return self.seq
2926+
2927+
property seq_len:
2928+
'''current sequence length.'''
2929+
def __get__(self):
2930+
return self.seq_len
2931+
2932+
def add_reference(self, FastaFile fastafile):
2933+
'''add reference sequences in `fastafile` to iterator.'''
2934+
self.fastafile = fastafile.fastafile
2935+
if self.seq != NULL:
2936+
free(self.seq)
2937+
self.tid = -1
2938+
2939+
2940+
def has_reference(self):
2941+
'''
2942+
return true if iterator is associated with a reference'''
2943+
return self.fastafile != NULL
2944+
2945+
27862946
cdef class SNPCall:
27872947
'''the results of a SNP call.'''
27882948
cdef int _tid

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
Cython>=3,<4
2+
parameterized>=0.9.0,<1

0 commit comments

Comments
 (0)