Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ freebsd_ci_task:

install_script: |
pkg install -y bcftools gmake py311-cython3 py311-mypy py311-pytest samtools
pip install parameterized

env:
CC: "clang -isystem /usr/local/include"
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install prerequisite Python libraries
run: pip install cython mypy pytest setuptools
run: pip install cython mypy pytest setuptools parameterized

- name: Install Linux build prerequisites
if: runner.os == 'Linux'
Expand Down Expand Up @@ -79,7 +79,7 @@ jobs:
python-version: ${{ matrix.python-version }}

- name: Install prerequisite Python libraries
run: pip install cython pytest
run: pip install cython pytest parameterized

- name: Install build prerequisites
if: runner.os == 'Linux'
Expand Down Expand Up @@ -147,7 +147,7 @@ jobs:
run: python setup.py install

- name: Install test prerequisites via Conda
run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest
run: conda install "samtools>=1.11" "bcftools>=1.11" "htslib>=1.11" pytest parameterized

- name: Run tests
run: REF_PATH=':' pytest
15 changes: 14 additions & 1 deletion pysam/libcalignmentfile.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ cdef class IteratorColumn:
# backwards compatibility
cdef char * getSequence(self)


cdef class IteratorColumnRegion(IteratorColumn):
cdef int start
cdef int stop
Expand All @@ -144,6 +143,20 @@ cdef class IteratorColumnAllRefs(IteratorColumn):
cdef class IteratorColumnAll(IteratorColumn):
pass

cdef class IteratorColumnRecords:
    # Column-wise (pileup) iterator over in-memory AlignedSegment records.
    #
    # Review note: this cannot extend IteratorColumn since IteratorColumn
    # requires a SAM file.

    # Advance the pileup engine by one column; the implementation returns
    # 1 when a column is available and 0 when the input is exhausted.
    cdef int cnext(self)
    # htslib pileup engine handle; created in __cinit__, destroyed in
    # __dealloc__.
    cdef bam_plp_t plp_iter
    # Reference id, position and number of pileup entries of the current
    # column; all three are written by bam_plp64_next() inside cnext().
    cdef int tid
    cdef hts_pos_t pos
    cdef int n_plp
    # Taken from the ``min_base_quality`` keyword argument (default 13) and
    # forwarded to makePileupColumn().
    cdef uint32_t min_base_quality
    # Pileup array of the current column as returned by bam_plp64_next();
    # NULL once iteration has finished.
    cdef const bam_pileup1_t * plp
    # Header taken from the first record handed to the constructor.
    cdef AlignmentHeader header
    # Reference sequence buffer and its length as filled in by
    # faidx_fetch_seq64(); the buffer is released with free().
    cdef char * seq
    cdef hts_pos_t seq_len
    # Raw faidx handle copied out of a FastaFile; NULL when no reference
    # has been attached.
    cdef faidx_t * fastafile
    # Return the ``seq`` buffer.
    cdef char * get_sequence(self)


cdef class IndexedReads:
cdef AlignmentFile samfile
Expand Down
7 changes: 7 additions & 0 deletions pysam/libcalignmentfile.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,13 @@ class IteratorColumn:
class IteratorColumnAll(IteratorColumn): ...
class IteratorColumnAllRefs(IteratorColumn): ...
class IteratorColumnRegion(IteratorColumn): ...
class IteratorColumnRecords:
    # Stub for the pileup iterator that is built from in-memory records.
    # The class is deliberately not a subclass of IteratorColumn, so
    # __iter__ (which returns the iterator itself) is typed accordingly.
    def __iter__(self) -> IteratorColumnRecords: ...
    def __next__(self) -> PileupColumn: ...
    @property
    def seq_len(self) -> int: ...
    def add_reference(self, fastafile: FastaFile) -> None: ...
    def has_reference(self) -> bool: ...

class SNPCall:
@property
Expand Down
162 changes: 161 additions & 1 deletion pysam/libcalignmentfile.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
# class IteratorColumnRegion
# class IteratorColumnAll
# class IteratorColumnAllRefs
# class IteratorColumnRecords
#
########################################################
#
Expand Down Expand Up @@ -57,6 +58,8 @@
########################################################
import os
import collections
from typing import Iterable, Optional

try:
from collections.abc import Sequence, Mapping # noqa
except ImportError:
Expand All @@ -74,7 +77,7 @@ from pysam.libcutils cimport force_bytes, force_str, charptr_to_str
from pysam.libcutils cimport OSError_from_errno, encode_filename, from_string_and_size
from pysam.libcalignedsegment cimport makeAlignedSegment, makePileupColumn
from pysam.libchtslib cimport HTSFile, hisremote, sam_index_load2, sam_index_load3, \
HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL
HTS_IDX_SAVE_REMOTE, HTS_IDX_SILENT_FAIL, hts_pos_t

from io import StringIO

Expand All @@ -86,6 +89,7 @@ __all__ = [
"AlignmentHeader",
"IteratorRow",
"IteratorColumn",
"IteratorColumnRecords",
"IndexedReads"]

IndexStats = collections.namedtuple("IndexStats",
Expand Down Expand Up @@ -2783,6 +2787,162 @@ cdef class IteratorColumnAll(IteratorColumn):
self.samfile.header)



cdef class IteratorColumnRecords:
    '''Iterator over pileup columns built from a collection of
    :class:`~pysam.AlignedSegment` records.

    For reasons of efficiency, the iterator requires the given
    :class:`~pysam.AlignedSegment` records to be in coordinate sorted
    order.  For implementation simplicity, all the records are consumed
    from the given iterable while the iterator is being constructed.

    For example::

        f = AlignmentFile("file.bam", "rb")
        result = list(IteratorColumnRecords([rec for rec in f]))

    Here, ``result`` will be a list of :class:`~pysam.PileupColumn`
    objects, one per column reported by the pileup engine.

    If the iterator is associated with a :class:`~pysam.FastaFile`
    (either through the ``fastafile`` keyword argument or the
    :meth:`add_reference` method), the reference sequence of the current
    contig is loaded while iterating and its length is available via
    :attr:`seq_len`.  Only the underlying C handle of the
    :class:`~pysam.FastaFile` is stored, so the caller has to keep the
    :class:`~pysam.FastaFile` object alive and open for as long as the
    iterator is in use.

    Recognised keyword arguments are ``fastafile`` (default ``None``)
    and ``min_base_quality`` (default 13); see
    :meth:`AlignmentFile.pileup` for their meaning.

    .. note::

        **Filtering Behavior:** This iterator uses a push-based approach where
        records are added via ``bam_plp_push()``. This differs from the standard
        :meth:`AlignmentFile.pileup` which uses a pull-based callback approach
        with htslib's internal filtering pipeline.

        If you manually filter records before passing them to this iterator
        (e.g., filtering by flags, mapping quality, etc.), the resulting pileup
        may differ slightly from equivalent ``samtools mpileup`` output, even
        with identical filtering criteria. This is because the standard pileup
        pipeline performs additional processing during iteration, including:

        - Base Alignment Quality (BAQ) computation
        - Mapping quality adjustments
        - Internal overlap handling
        - Filter application timing differences

        For exact ``samtools mpileup`` compatibility, use
        :meth:`AlignmentFile.pileup` with the ``stepper="samtools"`` option
        instead of manually filtering and using this iterator.

        This iterator is best suited for cases where you need to:

        - Build pileups from records already in memory
        - Apply custom filtering logic not available in standard pileup
        - Process records from multiple sources before pileup generation

    '''

    def __cinit__(self, recs: Iterable[AlignedSegment], **kwargs):
        '''Create the pileup engine and feed every record of `recs` into it.

        Raises :exc:`MemoryError` if the engine cannot be allocated and
        :exc:`Exception` if htslib rejects a record or the end-of-input
        marker.
        '''
        cdef FastaFile fastafile = kwargs.get("fastafile", None)
        cdef AlignedSegment rec

        if fastafile is None:
            self.fastafile = NULL
        else:
            self.fastafile = fastafile.fastafile
        self.min_base_quality = kwargs.get("min_base_quality", 13)
        self.seq = NULL
        self.header = None

        self.plp_iter = <bam_plp_t>bam_plp_init(NULL, NULL)
        if self.plp_iter == NULL:
            raise MemoryError("could not allocate pileup iterator")

        for rec in recs:
            if rec is None:
                # A None entry would otherwise be dereferenced below.
                raise TypeError("records must be AlignedSegment objects, not None")
            # The header of the first record that carries one is used for
            # all columns.
            if self.header is None:
                self.header = rec.header
            if bam_plp_push(self.plp_iter, rec._delegate) != 0:
                raise Exception("Could not add record to the iterator: {}".format(str(rec)))
        # Signal end of input
        if bam_plp_push(self.plp_iter, NULL) != 0:
            raise Exception("Could not finalize the iterator")

    def __dealloc__(self):
        # __dealloc__ also runs when __cinit__ raised, so the engine may
        # never have been created.
        if self.plp_iter != NULL:
            bam_plp_destroy(self.plp_iter)
            self.plp_iter = <bam_plp_t>NULL
        if self.seq != NULL:
            free(self.seq)
            self.seq = NULL

    def __iter__(self):
        return self

    cdef int cnext(self):
        '''perform next iteration.

        Updates ``tid``, ``pos``, ``n_plp`` and ``plp`` in place and
        returns 1 if a column is available, 0 otherwise.
        '''
        self.plp = <bam_pileup1_t*>bam_plp64_next(
            self.plp_iter,
            &self.tid,
            &self.pos,
            &self.n_plp
        )
        if self.plp == NULL:
            return 0
        else:
            return 1

    def __next__(self):
        '''Return the next :class:`~pysam.PileupColumn`.

        Raises :exc:`StopIteration` when no column is left and
        :exc:`ValueError` if a reference is attached but the sequence for
        the current contig cannot be obtained.
        '''
        # cnext() overwrites self.tid, so remember the contig of the
        # previous column to be able to detect a contig change.
        cdef int prev_tid = self.tid
        cdef int tid
        cdef char *ref_name

        if self.cnext() == 0:
            raise StopIteration

        # (Re)load the reference when none is loaded yet (first column or
        # after add_reference()) or when the contig has changed.
        if self.fastafile != NULL and (self.seq == NULL or self.tid != prev_tid):
            if self.seq != NULL:
                free(self.seq)
                self.seq = NULL

            if self.header is None:
                raise ValueError(
                    "cannot fetch reference sequence: records provide no header")
            tid = self.tid
            if tid < 0 or tid >= self.header.ptr.n_targets:
                raise ValueError(
                    "reference id {} is not present in the header".format(tid))

            # Resolve the contig name while holding the GIL so that the
            # nogil section only touches C data.
            ref_name = self.header.ptr.target_name[tid]
            with nogil:
                self.seq = faidx_fetch_seq64(
                    self.fastafile,
                    ref_name,
                    0, MAX_POS,
                    &self.seq_len)

            if self.seq == NULL:
                raise ValueError(
                    "reference sequence for '{}' (tid={}) not found".format(
                        charptr_to_str(ref_name), self.tid))

        return makePileupColumn(&self.plp,
                                self.tid,
                                self.pos,
                                self.n_plp,
                                self.min_base_quality,
                                self.seq,
                                self.header)

    cdef char * get_sequence(self):
        '''return current reference sequence underlying the iterator.

        The pointer is NULL when no sequence is loaded; the buffer stays
        owned by the iterator.
        '''
        return self.seq

    # NOTE(review): this property has the same name as the C attribute
    # declared in the .pxd - confirm that Cython accepts the pair and
    # resolves ``self.seq_len`` below to the C field.
    property seq_len:
        '''current sequence length.'''
        def __get__(self):
            return self.seq_len

    def add_reference(self, FastaFile fastafile):
        '''add reference sequences in `fastafile` to iterator.

        Any sequence loaded from a previously attached reference is
        discarded; the next column triggers a fresh load.
        '''
        self.fastafile = fastafile.fastafile
        if self.seq != NULL:
            free(self.seq)
            # Reset the pointer so it is neither handed out nor freed again.
            self.seq = NULL
        self.tid = -1

    def has_reference(self):
        '''
        return true if iterator is associated with a reference'''
        return self.fastafile != NULL


cdef class SNPCall:
'''the results of a SNP call.'''
cdef int _tid
Expand Down
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
Cython>=3,<4
parameterized>=0.9.0,<1
Loading
Loading