From 108b2d59f9a261bdcc05f0cdf79c1841c38cc6f5 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Sun, 27 Apr 2025 15:05:12 +0200 Subject: [PATCH 1/2] Add `AlignmentFlag.filter` for iterating on reads without certain flags set --- pysam/libcalignmentfile.pxd | 6 +++++ pysam/libcalignmentfile.pyi | 6 +++++ pysam/libcalignmentfile.pyx | 53 +++++++++++++++++++++++++++++++++++++ tests/AlignmentFile_test.py | 34 +++++++++++++++++++++++- 4 files changed, 98 insertions(+), 1 deletion(-) diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index cd0ebf83..33eeeee6 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -99,6 +99,12 @@ cdef class IteratorRowSelection(IteratorRow): cdef int cnext(self) +cdef class IteratorRowFilter(IteratorRow): + cdef int flag_filter + cdef bam1_t * getCurrent(self) + cdef int cnext(self) + + cdef class IteratorColumn: # result of the last plbuf_push diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index 6f106af9..ce9a3b21 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -165,6 +165,11 @@ class AlignmentFile(HTSFile): def find_introns( self, read_iterator: Iterable[AlignedSegment] ) -> Dict[Tuple[int, int], int]: ... + def filter( + self, + flag_filter: int = ..., + multiple_iterators: bool = ..., + ) -> IteratorRowFilter: ... def close(self) -> None: ... def write(self, read: AlignedSegment) -> int: ... def __enter__(self) -> AlignmentFile: ... @@ -202,6 +207,7 @@ class IteratorRowAllRefs(IteratorRow): ... class IteratorRowHead(IteratorRow): ... class IteratorRowRegion(IteratorRow): ... class IteratorRowSelection(IteratorRow): ... +class IteratorRowFilter(IteratorRow): ... class IteratorColumn: def __iter__(self) -> IteratorColumn: ... diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index dc8cadd0..6f62ff17 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1663,6 +1663,10 @@ cdef class AlignmentFile(HTSFile): res[(junc_start, base_position)] += 1 return res + def filter(self, **kwargs): + if not self.is_open: + raise ValueError("I/O operation on closed file") + return IteratorRowFilter(self, **kwargs) def close(self): '''closes the :class:`pysam.AlignmentFile`.''' @@ -2323,6 +2327,55 @@ cdef class IteratorRowSelection(IteratorRow): raise IOError(read_failure_reason(ret)) +cdef class IteratorRowFilter(IteratorRow): + """*(AlignmentFile samfile, int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, bool multiple_iterators=False)* + + iterates over reads that have none of the given flags set. + + .. note:: + It is usually not necessary to create an object of this class + explicitly. It is returned as a result of call to a :meth:`AlignmentFile.filter`. + """ + def __init__( + self, + AlignmentFile samfile, + int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, + bint multiple_iterators=True + ): + super().__init__(samfile, multiple_iterators=multiple_iterators) + + self.flag_filter = flag_filter + + def __iter__(self): + return self + + cdef bam1_t* getCurrent(self): + return self.b + + cdef int cnext(self): + '''cversion of iterator. Used by IteratorColumn''' + cdef int ret + cdef bam_hdr_t * hdr = self.header.ptr + with nogil: + while 1: + ret = sam_read1(self.htsfile, hdr, self.b) + if ret < 0: + break + if self.b.core.flag & self.flag_filter: + continue + break + return ret + + def __next__(self): + cdef int ret = self.cnext() + if ret >= 0: + return makeAlignedSegment(self.b, self.header) + elif ret == -1: + raise StopIteration + else: + raise IOError(read_failure_reason(ret)) + + cdef int __advance_nofilter(void *data, bam1_t *b): '''advance without any read filtering. ''' diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 0d599df3..3c309f95 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -2411,7 +2411,39 @@ def test_reading_writing_cram(self): return read = self.build_read() self.check_read(read, mode="cram") - + + +class TestAlignmentFileFilter(unittest.TestCase): + + filename = os.path.join(BAM_DATADIR, 'ex1.bam') + mode = "rb" + + def testNoFilter(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + total = 0 + for read in samfile1: + total += 1 + with pysam.AlignmentFile(self.filename, self.mode) as samfile2: + n2 = 0 + for read in samfile1.filter(0): + n2 += 1 + self.assertEqual(total, n2) + + def testMapped(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + n1 = 0 + total = 0 + for read in samfile1: + n1 += read.is_mapped + total += 1 + self.assertLowerEqual(n1, total) + with pysam.AlignmentFile(self.filename, self.mode) as samfile2: + n2 = 0 + for read in samfile1.filter(pysam.BAM_FUNMAP): + n2 += 1 + self.assertEqual(n1, n2) + + # SAM writing fails, as query length is 0 # class TestSanityCheckingSAM(TestSanityCheckingSAM): # mode = "w" From 49aa5b04fb7986fe1ccbba4454728e6711ac26e2 Mon Sep 17 00:00:00 2001 From: Martin Larralde Date: Sun, 27 Apr 2025 15:31:38 +0200 Subject: [PATCH 2/2] Fix tests in `TestAlignmentFileFilter` --- pysam/libcalignmentfile.pxd | 1 + pysam/libcalignmentfile.pyi | 1 + pysam/libcalignmentfile.pyx | 26 +++++++++++++++++++++++++- tests/AlignmentFile_test.py | 19 ++++++++++++++++--- 4 files changed, 43 insertions(+), 4 deletions(-) diff --git a/pysam/libcalignmentfile.pxd b/pysam/libcalignmentfile.pxd index 33eeeee6..b59cbdef 100644 --- a/pysam/libcalignmentfile.pxd +++ b/pysam/libcalignmentfile.pxd @@ -101,6 +101,7 @@ cdef class IteratorRowSelection(IteratorRow): cdef class IteratorRowFilter(IteratorRow): cdef int flag_filter + cdef int flag_require cdef bam1_t * getCurrent(self) cdef int cnext(self) diff --git a/pysam/libcalignmentfile.pyi b/pysam/libcalignmentfile.pyi index ce9a3b21..228f4f98 100644 --- a/pysam/libcalignmentfile.pyi +++ b/pysam/libcalignmentfile.pyi @@ -168,6 +168,7 @@ class AlignmentFile(HTSFile): def filter( self, flag_filter: int = ..., + flag_require: int = ..., multiple_iterators: bool = ..., ) -> IteratorRowFilter: ... def close(self) -> None: ... diff --git a/pysam/libcalignmentfile.pyx b/pysam/libcalignmentfile.pyx index 6f62ff17..61eae02d 100644 --- a/pysam/libcalignmentfile.pyx +++ b/pysam/libcalignmentfile.pyx @@ -1664,6 +1664,26 @@ cdef class AlignmentFile(HTSFile): return res def filter(self, **kwargs): + """iterate and filter segments in the file. + + Parameters + ---------- + + flag_filter : int + + ignore reads where any of the bits in the flag are set. The default + is BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP. + + flag_require : int + + only use reads where certain flags are set. The default is 0. + + Returns + ------- + + an iterator over filtered rows. : IteratorRowFilter + + """ if not self.is_open: raise ValueError("I/O operation on closed file") return IteratorRowFilter(self, **kwargs) @@ -2328,7 +2348,7 @@ cdef class IteratorRowSelection(IteratorRow): cdef class IteratorRowFilter(IteratorRow): - """*(AlignmentFile samfile, int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, bool multiple_iterators=False)* + """*(AlignmentFile samfile, int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, int flag_require = 0, bool multiple_iterators=False)* iterates over reads that have none of the given flags set. @@ -2340,11 +2360,13 @@ cdef class IteratorRowFilter(IteratorRow): self, AlignmentFile samfile, int flag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP, + int flag_require = 0, bint multiple_iterators=True ): super().__init__(samfile, multiple_iterators=multiple_iterators) self.flag_filter = flag_filter + self.flag_require = flag_require def __iter__(self): return self @@ -2363,6 +2385,8 @@ cdef class IteratorRowFilter(IteratorRow): break if self.b.core.flag & self.flag_filter: continue + elif self.flag_require and not (self.b.core.flag & self.flag_require): + continue break return ret diff --git a/tests/AlignmentFile_test.py b/tests/AlignmentFile_test.py index 3c309f95..dc5340ba 100644 --- a/tests/AlignmentFile_test.py +++ b/tests/AlignmentFile_test.py @@ -2425,7 +2425,7 @@ def testNoFilter(self): total += 1 with pysam.AlignmentFile(self.filename, self.mode) as samfile2: n2 = 0 - for read in samfile1.filter(0): + for read in samfile2.filter(flag_filter=0): n2 += 1 self.assertEqual(total, n2) @@ -2436,13 +2436,26 @@ def testMapped(self): for read in samfile1: n1 += read.is_mapped total += 1 - self.assertLowerEqual(n1, total) + self.assertLessEqual(n1, total) with pysam.AlignmentFile(self.filename, self.mode) as samfile2: n2 = 0 - for read in samfile1.filter(pysam.BAM_FUNMAP): + for read in samfile2.filter(flag_filter=pysam.FUNMAP): n2 += 1 self.assertEqual(n1, n2) + def testUnmapped(self): + with pysam.AlignmentFile(self.filename, self.mode) as samfile1: + n1 = 0 + total = 0 + for read in samfile1: + n1 += not read.is_mapped + total += 1 + self.assertLessEqual(n1, total) + with pysam.AlignmentFile(self.filename, self.mode) as samfile2: + n2 = 0 + for read in samfile2.filter(flag_filter=0, flag_require=pysam.FUNMAP): + n2 += 1 + self.assertEqual(n1, n2) # SAM writing fails, as query length is 0 # class TestSanityCheckingSAM(TestSanityCheckingSAM):