Skip to content

Commit

Permalink
rewrite scoretrack.py
Browse files Browse the repository at this point in the history
  • Loading branch information
taoliu committed Oct 18, 2024
1 parent 086aad0 commit f7a8cb7
Show file tree
Hide file tree
Showing 8 changed files with 1,697 additions and 109 deletions.
157 changes: 155 additions & 2 deletions MACS3/IO/Parser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# cython: language_level=3
# cython: profile=True
# cython: linetrace=True
# Time-stamp: <2024-10-07 16:08:43 Tao Liu>
# Time-stamp: <2024-10-16 00:09:32 Tao Liu>

"""Module for all MACS Parser classes for input. Please note that the
parsers are for reading the alignment files ONLY.
Expand All @@ -28,7 +28,7 @@

from MACS3.Utilities.Constants import READ_BUFFER_SIZE
from MACS3.Signal.FixWidthTrack import FWTrack
from MACS3.Signal.PairedEndTrack import PETrackI
from MACS3.Signal.PairedEndTrack import PETrackI, PETrackII
from MACS3.Utilities.Logger import logging

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -1472,3 +1472,156 @@ def fw_parse_line(self, thisline: bytes) -> tuple:
1)
else:
raise StrandFormatError(thisline, thisfields[1])


@cython.cclass
class FragParser(GenericParser):
"""Parser for Fragment file containing scATAC-seq information.
Format:
chromosome frag_leftend frag_rightend barcode count
Note: Only the first five columns are used!
"""
n = cython.declare(cython.int, visibility='public')
d = cython.declare(cython.float, visibility='public')

@cython.cfunc
def skip_first_commentlines(self):
"""BEDPEParser needs to skip the first several comment lines.
"""
l_line: cython.int
thisline: bytes

for thisline in self.fhd:
l_line = len(thisline)
if thisline and (thisline[:5] != b"track") \
and (thisline[:7] != b"browser") \
and (thisline[0] != 35): # 35 is b"#"
break

# rewind from SEEK_CUR
self.fhd.seek(-l_line, 1)
return

@cython.cfunc
def pe_parse_line(self, thisline: bytes):
"""Parse each line, and return chromosome, left and right
positions, barcode and count.
"""
thisfields: list

thisline = thisline.rstrip()

# still only support tabular as delimiter.
thisfields = thisline.split(b'\t')
try:
return (thisfields[0],
atoi(thisfields[1]),
atoi(thisfields[2]),
thisfields[3],
atoi(thisfields[4]))
except IndexError:
raise Exception("Less than 5 columns found at this line: %s\n" % thisline)

@cython.ccall
def build_petrack2(self):
"""Build PETrackII from all lines.
"""
chromosome: bytes
left_pos: cython.int
right_pos: cython.int
barcode: bytes
count: cython.uchar
i: cython.long = 0 # number of fragments
m: cython.long = 0 # sum of fragment lengths
tmp: bytes = b""

petrack = PETrackII(buffer_size=self.buffer_size)
add_loc = petrack.add_loc

while True:
# for each block of input
tmp += self.fhd.read(READ_BUFFER_SIZE)
if not tmp:
break
lines = tmp.split(b"\n")
tmp = lines[-1]
for thisline in lines[:-1]:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos < 0 or not chromosome:
continue
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
m += right_pos - left_pos
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos >= 0 and chromosome:
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = cython.cast(cython.float, m) / i
self.n = i
assert self.d >= 0, "Something went wrong (mean fragment size was negative)"

self.close()
petrack.set_rlengths({"DUMMYCHROM": 0})
return petrack

@cython.ccall
def append_petrack(self, petrack):
"""Build PETrackI from all lines, return a PETrackI object.
"""
chromosome: bytes
left_pos: cython.int
right_pos: cython.int
barcode: bytes
count: cython.uchar
i: cython.long = 0 # number of fragments
m: cython.long = 0 # sum of fragment lengths
tmp: bytes = b""

add_loc = petrack.add_loc
while True:
# for each block of input
tmp += self.fhd.read(READ_BUFFER_SIZE)
if not tmp:
break
lines = tmp.split(b"\n")
tmp = lines[-1]
for thisline in lines[:-1]:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos < 0 or not chromosome:
continue
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
m += right_pos - left_pos
i += 1
if i % 1000000 == 0:
info(" %d fragments parsed" % i)
add_loc(chromosome, left_pos, right_pos, barcode, count)
# last one
if tmp:
(chromosome, left_pos, right_pos, barcode, count) = self.pe_parse_line(thisline)
if left_pos >= 0 and chromosome:
assert right_pos > left_pos, "Right position must be larger than left position, check your BED file at line: %s" % thisline
i += 1
m += right_pos - left_pos
add_loc(chromosome, left_pos, right_pos, barcode, count)

self.d = (self.d * self.n + m) / (self.n + i)
self.n += i

assert self.d >= 0, "Something went wrong (mean fragment size was negative)"
self.close()
petrack.set_rlengths({"DUMMYCHROM": 0})
return petrack
20 changes: 19 additions & 1 deletion MACS3/IO/PeakIO.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# cython: language_level=3
# cython: profile=True
# Time-stamp: <2024-10-11 11:13:11 Tao Liu>
# Time-stamp: <2024-10-15 11:48:33 Tao Liu>

"""Module for PeakIO IO classes.
Expand Down Expand Up @@ -141,6 +141,24 @@ def __str__(self):
self.end,
self.score)

def __getstate__(self):
return (self.chrom,
self.start,
self.end,
self.length,
self.summit,
self.score,
self.pileup,
self.pscore,
self.fc,
self.qscore,
self.name)

def __setstate__(self, state):
(self.chrom, self.start, self.end, self.length, self.summit,
self.score, self.pileup, self.pscore, self.fc,
self.qscore, self.name) = state


@cython.cclass
class PeakIO:
Expand Down
Loading

0 comments on commit f7a8cb7

Please sign in to comment.