Merge pull request #272 from BuysDB/DamID
DamID update
BuysDB authored Oct 14, 2024
2 parents 951a106 + 9be66f5 commit 222c476
Showing 27 changed files with 3,226 additions and 79 deletions.
23 changes: 8 additions & 15 deletions .github/workflows/pythonapp.yml
@@ -1,31 +1,24 @@
name: Python application

on: [push]
on: [push, pull_request]

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
- name: Set up Python 3.7
uses: actions/setup-python@v1
- uses: actions/checkout@v4
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: 3.7
python-version: 3.12
- name: Install dependencies
run: |
python -m pip install --upgrade pip
# python -m pip install --upgrade pip
pip install pytest
pip install .
pip install -r requirements.txt
- name: Lint with flake8
run: |
pip install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pip install pytest
pip install .
pytest
3 changes: 3 additions & 0 deletions setup.py
@@ -64,6 +64,9 @@
'singlecellmultiomics/fastaProcessing/createMappabilityIndex.py',
'singlecellmultiomics/fastaProcessing/fastaCreateDict.py',

# Fastq
'singlecellmultiomics/fastqProcessing/fastq_filter_by_dt.py',

# Tagging
'singlecellmultiomics/universalBamTagger/universalBamTagger.py',
'singlecellmultiomics/universalBamTagger/bamtagmultiome_multi.py',
66 changes: 46 additions & 20 deletions singlecellmultiomics/bamProcessing/bamBinCounts.py
@@ -1007,13 +1007,23 @@ def count_fragments_binned(args):
continue

# Extract the site
site = int(read.get_tag('DS'))

try:
site = int(read.get_tag('DS'))
except KeyError:
if read.reference_start is not None:
site = read.reference_start
elif read.reference_end is not None:
site = read.reference_end
else:
continue
# Don't count sites outside the selected bounds
if site < start or site >= end:
continue

sample = read.get_tag('SM')

try:
sample = read.get_tag('SM')
except KeyError:
sample = 'bulk'

# Process alternative contig counts:
if alt_spans is not None and contig in alt_spans:
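The new fallback can be read as a small helper: prefer the DS site tag when present, otherwise fall back to the read's alignment coordinates, and skip reads with neither. A minimal restatement of that logic (the helper name is ours, not part of the commit):

def resolve_site(read):
    # Prefer the DS tag when present; otherwise fall back to the alignment
    # coordinates. Returns None when the read has no usable position.
    try:
        return int(read.get_tag('DS'))
    except KeyError:
        if read.reference_start is not None:
            return read.reference_start
        if read.reference_end is not None:
            return read.reference_end
        return None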
@@ -1064,33 +1074,47 @@ def count_fixed_width_binned_regions(path:str,
contig_starts:int,
smap:dict,
min_mapq: int = 1,
identifier=None):
identifier=None,
read_pass_function=None # Called for every read; the read is counted only when this returns True. Counted reads must carry the DS tag
):
"""
Count using fixed width regions with bin_size
The header is precomputed
"""
counts = np.zeros((len(smap),n_bins))
with pysam.AlignmentFile(path) as a:

for read in a.fetch(contig):
if not read.is_read1 or read.is_qcfail or read.is_duplicate:
continue
if not read.has_tag('DS'):
continue
if read.mapping_quality<min_mapq:
continue
ds = read.get_tag('DS')
#bi = int(read.get_tag('bi'))
bi = smap[read.get_tag('SM')]
bx = int(ds/bin_size) + contig_starts[contig]
counts[bi,bx] +=1

# Standard filter loop
if read_pass_function is not None:
for read in a.fetch(contig):
if not read_pass_function(read):
continue
ds = read.get_tag('DS')
#bi = int(read.get_tag('bi'))
bi = smap[read.get_tag('SM')]
bx = int(ds/bin_size) + contig_starts[contig]
counts[bi,bx] +=1
else:
for read in a.fetch(contig):
if read.is_read2 or read.is_qcfail or read.is_duplicate:
continue
if not read.has_tag('DS'):
continue
if read.mapping_quality<min_mapq:
continue
ds = read.get_tag('DS')
#bi = int(read.get_tag('bi'))
bi = smap[read.get_tag('SM')]
bx = int(ds/bin_size) + contig_starts[contig]
counts[bi,bx] +=1

if identifier is not None:
return counts,identifier
else:
return counts


def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=('MT',), verbose=False, include_contigs=None):
def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=('MT',), verbose=False, include_contigs=None, read_pass_function=None):

cmds = []
headers = dict()
@@ -1123,6 +1147,7 @@ def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=
'bin_size':bin_size,
'contig':contig,
'smap':smap,
'read_pass_function':read_pass_function,
'identifier':patient
}) for contig in cs.keys()
]
@@ -1133,10 +1158,11 @@ def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=
print("Counting")
with Pool(n_threads) as workers:

for i,(r,identifier) in enumerate(workers.map(pool_wrapper, cmds)):
for i,(r,identifier) in enumerate(workers.imap(pool_wrapper, cmds, chunksize=1)):
countsDict[identifier] += r
if verbose:
print(f"Progress: {(i/len(cmds))*100:.2f}% ", end='\r')

if verbose:
print("Finished counting, Now creating dataframes")
for identifier in countsDict:
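The read_pass_function hook added above lets callers replace the built-in filter in count_fixed_width_binned_regions and count_multi_sample. A minimal sketch of such a filter, mirroring the default loop; the function name and bin size are illustrative, and the filter should be a module-level (picklable) function because counting is dispatched to a worker pool:

def damid_read_pass(read):
    # Count only primary, non-duplicate, QC-passing R1 reads that carry a DS
    # site tag and map with at least the default mapping quality.
    return (not read.is_read2
            and not read.is_qcfail
            and not read.is_duplicate
            and read.has_tag('DS')
            and read.mapping_quality >= 1)

# counts = count_multi_sample(patientsToBam, bin_size=250_000,
#                             read_pass_function=damid_read_pass)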
4 changes: 2 additions & 2 deletions singlecellmultiomics/bamProcessing/bamFunctions.py
@@ -312,7 +312,7 @@ def get_read_group_format(bam):

if len(format_type_obs)==1:
return format_type_obs.most_common(1)[0][0]
raise ValueError('Failed to indentify read group format ')
raise ValueError('Failed to identify read group format ')


def _get_sample_to_read_group_dict(handle):
@@ -1012,7 +1012,7 @@ def replace_bam_header(origin_bam_path, header, target_bam_path=None, header_wri
pass

# Concatenate and remove origin
rehead_cmd = f"""{{ cat '{headerSamFilePath}'; samtools view '{origin_bam_path}'; }} | samtools view -b > '{complete_temp_path}' &&
rehead_cmd = f"""{{ cat '{headerSamFilePath}'; samtools view -@4 '{origin_bam_path}'; }} | samtools view -b -@4> '{complete_temp_path}' &&
mv '{complete_temp_path}' '{target_bam_path}' && rm '{headerSamFilePath}';
"""
assert (os.system(rehead_cmd)==0) and os.path.exists(target_bam_path), f'Reheading failed. Command: {rehead_cmd} '
2 changes: 1 addition & 1 deletion singlecellmultiomics/fastqProcessing/fastqHandle.py
@@ -26,7 +26,7 @@ def __init__(
'R2.fastq.gz',
'wt',compresslevel=1)]
else:
self.handles = [gzip.open(path + 'reads.fastq.gz', 'wt')]
self.handles = [gzip.open(path + 'R1.fastq.gz', 'wt',compresslevel=1)]
else:

self.handles = HandleLimiter(
73 changes: 73 additions & 0 deletions singlecellmultiomics/fastqProcessing/fastq_filter_by_dt.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
import argparse
import gzip
from more_itertools import chunked

def select_datatype_fastq_pe(r1path, r2path, r1path_out, r2path_out, dt):
look_for = f'dt:{dt}'
with gzip.open(r1path,'rt') as r1h,\
gzip.open(r2path,'rt') as r2h,\
gzip.open(r1path_out,'wt') as r1ho,\
gzip.open(r2path_out,'wt') as r2ho:

for r1,r2 in zip(chunked(r1h,4), chunked(r2h,4)):
if look_for in r1[0]:
r1ho.write(''.join(r1))
r2ho.write(''.join(r2))

def select_datatype_fastq_se(r1path, r1path_out, dt):
look_for = f'dt:{dt}'
with gzip.open(r1path,'rt') as r1h,\
gzip.open(r1path_out,'wt') as r1ho:

for r1 in chunked(r1h,4):
if look_for in r1[0]:
r1ho.write(''.join(r1))



def run():
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Split fastq file based on dt: tag"
)

argparser.add_argument(
'-r1_in',
type=str,required=True
)
argparser.add_argument(
'-r2_in',
type=str,required=False
)
argparser.add_argument(
'-r1_out',help='R1 output',
type=str,required=True
)
argparser.add_argument(
'-r2_out',help='R2 output',
type=str,required=False
)

argparser.add_argument(
'-dt',
help="Datatype, for example RNA or DamID",
type=str,required=True
)


args = argparser.parse_args()
if args.r2_in is not None:
select_datatype_fastq_pe(args.r1_in,
args.r2_in,
args.r1_out,
args.r2_out,
args.dt)
else:
select_datatype_fastq_se(args.r1_in,
args.r1_out,
args.dt)
print('Done')

if __name__ == '__main__':
run()
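The new script keeps only reads whose header contains the requested dt: tag. A usage sketch of the paired-end entry point; the file names below are hypothetical:

from singlecellmultiomics.fastqProcessing.fastq_filter_by_dt import select_datatype_fastq_pe

# Write only the DamID read pairs (R1 headers containing 'dt:DamID') to the output files.
select_datatype_fastq_pe('demux_R1.fastq.gz', 'demux_R2.fastq.gz',
                         'damid_R1.fastq.gz', 'damid_R2.fastq.gz',
                         dt='DamID')

The command-line equivalent is fastq_filter_by_dt.py -r1_in demux_R1.fastq.gz -r2_in demux_R2.fastq.gz -r1_out damid_R1.fastq.gz -r2_out damid_R2.fastq.gz -dt DamID; omit the R2 arguments for single-end input.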
@@ -93,7 +93,7 @@ def detect(self, filesToList, replace=None, slib=None, merge=None, se=False, ign
# Check if we are dealing with a raw illumina or SRR fastq file:

if fastqFileName.endswith('_R1') or fastqFileName.endswith('_R2'):
lane = '0' # Create "fake" lane
lane = 'single_file' # Create "fake" lane
library = self.libraryReplace(
fastqFileName.rsplit('_R', 1)[0], replace)
r1ORr2 = fastqFileName.rsplit('_', 1)[-1]
@@ -111,7 +111,7 @@ def detect(self, filesToList, replace=None, slib=None, merge=None, se=False, ign
#print(path, library, r1ORr2, self.libraries)

elif fastqFileName.endswith('R1') or fastqFileName.endswith('R2'):
lane = '0' # Create "fake" lane
lane = 'single_file' # Create "fake" lane
library = self.libraryReplace(
fastqFileName.rsplit('R', 1)[0], replace)
r1ORr2 = 'R' + fastqFileName[-1]
@@ -138,7 +138,7 @@ def detect(self, filesToList, replace=None, slib=None, merge=None, se=False, ign
lane = library
library = slib
else:
lane = '0'
lane = 'single_file'

if library not in self.libraries:
self.libraries[library] = {lane: {}}
