Merge pull request #272 from BuysDB/DamID
DamID update
BuysDB authored Oct 14, 2024
2 parents 951a106 + 9be66f5 commit 222c476
Showing 27 changed files with 3,226 additions and 79 deletions.
23 changes: 8 additions & 15 deletions .github/workflows/pythonapp.yml
@@ -1,31 +1,24 @@
name: Python application

on: [push]
on: [push, pull_request]

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
- name: Set up Python 3.7
uses: actions/setup-python@v1
- uses: actions/checkout@v4
- name: Set up Python 3.12
uses: actions/setup-python@v5
with:
python-version: 3.7
python-version: 3.12
- name: Install dependencies
run: |
python -m pip install --upgrade pip
# python -m pip install --upgrade pip
pip install pytest
pip install .
pip install -r requirements.txt
- name: Lint with flake8
run: |
pip install flake8
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pip install pytest
pip install .
pytest
3 changes: 3 additions & 0 deletions setup.py
@@ -64,6 +64,9 @@
'singlecellmultiomics/fastaProcessing/createMappabilityIndex.py',
'singlecellmultiomics/fastaProcessing/fastaCreateDict.py',

# Fastq
'singlecellmultiomics/fastqProcessing/fastq_filter_by_dt.py',

# Tagging
'singlecellmultiomics/universalBamTagger/universalBamTagger.py',
'singlecellmultiomics/universalBamTagger/bamtagmultiome_multi.py',
66 changes: 46 additions & 20 deletions singlecellmultiomics/bamProcessing/bamBinCounts.py
@@ -1007,13 +1007,23 @@ def count_fragments_binned(args):
continue

# Extract the site
site = int(read.get_tag('DS'))

try:
site = int(read.get_tag('DS'))
except KeyError:
if read.reference_start is not None:
site = read.reference_start
elif read.reference_end is not None:
site = read.reference_end
else:
continue
# Don't count sites outside the selected bounds
if site < start or site >= end:
continue

sample = read.get_tag('SM')

try:
sample = read.get_tag('SM')
except KeyError:
sample = 'bulk'

# Process alternative contig counts:
if alt_spans is not None and contig in alt_spans:
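The new fallback can be read as a small helper: prefer the DS site tag when present, otherwise fall back to the read's alignment coordinates, and skip reads with neither. A minimal restatement of that logic (the helper name is ours, not part of the commit):

def resolve_site(read):
    # Prefer the DS tag when present; otherwise fall back to the alignment
    # coordinates. Returns None when the read has no usable position.
    try:
        return int(read.get_tag('DS'))
    except KeyError:
        if read.reference_start is not None:
            return read.reference_start
        if read.reference_end is not None:
            return read.reference_end
        return None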
@@ -1064,33 +1074,47 @@ def count_fixed_width_binned_regions(path:str,
contig_starts:int,
smap:dict,
min_mapq: int = 1,
identifier=None):
identifier=None,
read_pass_function=None # Called for every read; the read is counted only when this returns True. Counted reads must carry the DS tag
):
"""
Count using fixed width regions with bin_size
The header is precomputed
"""
counts = np.zeros((len(smap),n_bins))
with pysam.AlignmentFile(path) as a:

for read in a.fetch(contig):
if not read.is_read1 or read.is_qcfail or read.is_duplicate:
continue
if not read.has_tag('DS'):
continue
if read.mapping_quality<min_mapq:
continue
ds = read.get_tag('DS')
#bi = int(read.get_tag('bi'))
bi = smap[read.get_tag('SM')]
bx = int(ds/bin_size) + contig_starts[contig]
counts[bi,bx] +=1

# Standard filter loop
if read_pass_function is not None:
for read in a.fetch(contig):
if not read_pass_function(read):
continue
ds = read.get_tag('DS')
#bi = int(read.get_tag('bi'))
bi = smap[read.get_tag('SM')]
bx = int(ds/bin_size) + contig_starts[contig]
counts[bi,bx] +=1
else:
for read in a.fetch(contig):
if read.is_read2 or read.is_qcfail or read.is_duplicate:
continue
if not read.has_tag('DS'):
continue
if read.mapping_quality<min_mapq:
continue
ds = read.get_tag('DS')
#bi = int(read.get_tag('bi'))
bi = smap[read.get_tag('SM')]
bx = int(ds/bin_size) + contig_starts[contig]
counts[bi,bx] +=1

if identifier is not None:
return counts,identifier
else:
return counts


def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=('MT',), verbose=False, include_contigs=None):
def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=('MT',), verbose=False, include_contigs=None, read_pass_function=None):

cmds = []
headers = dict()
@@ -1123,6 +1147,7 @@ def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=
'bin_size':bin_size,
'contig':contig,
'smap':smap,
'read_pass_function':read_pass_function,
'identifier':patient
}) for contig in cs.keys()
]
@@ -1133,10 +1158,11 @@ def count_multi_sample(patientsToBam, bin_size, n_threads=None, exclude_contigs=
print("Counting")
with Pool(n_threads) as workers:

for i,(r,identifier) in enumerate(workers.map(pool_wrapper, cmds)):
for i,(r,identifier) in enumerate(workers.imap(pool_wrapper, cmds, chunksize=1)):
countsDict[identifier] += r
if verbose:
print(f"Progress: {(i/len(cmds))*100:.2f}% ", end='\r')

if verbose:
print("Finished counting, Now creating dataframes")
for identifier in countsDict:
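The read_pass_function hook added above lets callers replace the built-in filter in count_fixed_width_binned_regions and count_multi_sample. A minimal sketch of such a filter, mirroring the default loop; the function name and bin size are illustrative, and the filter should be a module-level (picklable) function because counting is dispatched to a worker pool:

def damid_read_pass(read):
    # Count only primary, non-duplicate, QC-passing R1 reads that carry a DS
    # site tag and map with at least the default mapping quality.
    return (not read.is_read2
            and not read.is_qcfail
            and not read.is_duplicate
            and read.has_tag('DS')
            and read.mapping_quality >= 1)

# counts = count_multi_sample(patientsToBam, bin_size=250_000,
#                             read_pass_function=damid_read_pass)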
4 changes: 2 additions & 2 deletions singlecellmultiomics/bamProcessing/bamFunctions.py
@@ -312,7 +312,7 @@ def get_read_group_format(bam):

if len(format_type_obs)==1:
return format_type_obs.most_common(1)[0][0]
raise ValueError('Failed to indentify read group format ')
raise ValueError('Failed to identify read group format ')


def _get_sample_to_read_group_dict(handle):
@@ -1012,7 +1012,7 @@ def replace_bam_header(origin_bam_path, header, target_bam_path=None, header_wri
pass

# Concatenate and remove origin
rehead_cmd = f"""{{ cat '{headerSamFilePath}'; samtools view '{origin_bam_path}'; }} | samtools view -b > '{complete_temp_path}' &&
rehead_cmd = f"""{{ cat '{headerSamFilePath}'; samtools view -@4 '{origin_bam_path}'; }} | samtools view -b -@4> '{complete_temp_path}' &&
mv '{complete_temp_path}' '{target_bam_path}' && rm '{headerSamFilePath}';
"""
assert (os.system(rehead_cmd)==0) and os.path.exists(target_bam_path), f'Reheading failed. Command: {rehead_cmd} '
2 changes: 1 addition & 1 deletion singlecellmultiomics/fastqProcessing/fastqHandle.py
@@ -26,7 +26,7 @@ def __init__(
'R2.fastq.gz',
'wt',compresslevel=1)]
else:
self.handles = [gzip.open(path + 'reads.fastq.gz', 'wt')]
self.handles = [gzip.open(path + 'R1.fastq.gz', 'wt',compresslevel=1)]
else:

self.handles = HandleLimiter(
73 changes: 73 additions & 0 deletions singlecellmultiomics/fastqProcessing/fastq_filter_by_dt.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
import argparse
import gzip
from more_itertools import chunked

def select_datatype_fastq_pe(r1path, r2path, r1path_out, r2path_out, dt):
look_for = f'dt:{dt}'
with gzip.open(r1path,'rt') as r1h,\
gzip.open(r2path,'rt') as r2h,\
gzip.open(r1path_out,'wt') as r1ho,\
gzip.open(r2path_out,'wt') as r2ho:

for r1,r2 in zip(chunked(r1h,4), chunked(r2h,4)):
if look_for in r1[0]:
r1ho.write(''.join(r1))
r2ho.write(''.join(r2))

def select_datatype_fastq_se(r1path, r1path_out, dt):
look_for = f'dt:{dt}'
with gzip.open(r1path,'rt') as r1h,\
gzip.open(r1path_out,'wt') as r1ho:

for r1 in chunked(r1h,4):
if look_for in r1[0]:
r1ho.write(''.join(r1))



def run():
argparser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="Split fastq file based on dt: tag"
)

argparser.add_argument(
'-r1_in',
type=str,required=True
)
argparser.add_argument(
'-r2_in',
type=str,required=False
)
argparser.add_argument(
'-r1_out',help='R1 output',
type=str,required=True
)
argparser.add_argument(
'-r2_out',help='R2 output',
type=str,required=False
)

argparser.add_argument(
'-dt',
help="Datatype, for example RNA or DamID",
type=str,required=True
)


args = argparser.parse_args()
if args.r2_in is not None:
select_datatype_fastq_pe(args.r1_in,
args.r2_in,
args.r1_out,
args.r2_out,
args.dt)
else:
select_datatype_fastq_se(args.r1_in,
args.r1_out,
args.dt)
print('Done')

if __name__ == '__main__':
run()
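The new script keeps only reads whose header contains the requested dt: tag. A usage sketch of the paired-end entry point; the file names below are hypothetical:

from singlecellmultiomics.fastqProcessing.fastq_filter_by_dt import select_datatype_fastq_pe

# Write only the DamID read pairs (R1 headers containing 'dt:DamID') to the output files.
select_datatype_fastq_pe('demux_R1.fastq.gz', 'demux_R2.fastq.gz',
                         'damid_R1.fastq.gz', 'damid_R2.fastq.gz',
                         dt='DamID')

The command-line equivalent is fastq_filter_by_dt.py -r1_in demux_R1.fastq.gz -r2_in demux_R2.fastq.gz -r1_out damid_R1.fastq.gz -r2_out damid_R2.fastq.gz -dt DamID; omit the R2 arguments for single-end input.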
@@ -93,7 +93,7 @@ def detect(self, filesToList, replace=None, slib=None, merge=None, se=False, ign
# Check if we are dealing with a raw illumina or SRR fastq file:

if fastqFileName.endswith('_R1') or fastqFileName.endswith('_R2'):
lane = '0' # Create "fake" lane
lane = 'single_file' # Create "fake" lane
library = self.libraryReplace(
fastqFileName.rsplit('_R', 1)[0], replace)
r1ORr2 = fastqFileName.rsplit('_', 1)[-1]
@@ -111,7 +111,7 @@ def detect(self, filesToList, replace=None, slib=None, merge=None, se=False, ign
#print(path, library, r1ORr2, self.libraries)

elif fastqFileName.endswith('R1') or fastqFileName.endswith('R2'):
lane = '0' # Create "fake" lane
lane = 'single_file' # Create "fake" lane
library = self.libraryReplace(
fastqFileName.rsplit('R', 1)[0], replace)
r1ORr2 = 'R' + fastqFileName[-1]
@@ -138,7 +138,7 @@ def detect(self, filesToList, replace=None, slib=None, merge=None, se=False, ign
lane = library
library = slib
else:
lane = '0'
lane = 'single_file'

if library not in self.libraries:
self.libraries[library] = {lane: {}}
