Add alt seq and cram support (#87)

* Modify SV ID nv_SV?-????? to NV.DEL.SV?-????? * Integrate SV support file generation * Change lead read Lead read selection now based on median SV size rather than largest SV size * Removed ins_size_mod * Create nv_alt_seq.py * Add alt_seq * Add ins_seq * Add ins strandness * Add alt_seq * bump v1.7.0 * Fix wkdir error * Fix alt seq var error * Fix chr2 bug * Fix ALT parsing * Fix main sv bug * Fix median size * Fix key error * Fix bed <0 * Fix bed <0 * Fix bed <0 * Fix bed <0 * Add cram * Add refpath for cram * Fix cram ref * Fix pickle * Update changelog
cytham · Jun 4, 2024 · 8a7c5cb · 8a7c5cb
1 parent 75b8bf5
commit 8a7c5cb
Show file tree

Hide file tree

Showing 10 changed files with 630 additions and 618 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -4,6 +4,13 @@ NanoVar Changelog
 Release Summary:
 
 
+Version 1.7.0 - Jun 4, 2024
+    * Added alternative sequence for INS and DEL in VCF
+    * Added CRAM input support
+    * Modified SV ID format with addition of SV type, from nv_SV?-????? to NV.INS.SV?-?????
+    * SV size now takes the median size amongst clustered reads, instead of the largest
+
+
 Version 1.6.2 - Apr 29, 2024
     * Fixed bug where some supplementary alignments do not have primary alignments, in the case of a subsample BAM input
 

diff --git a/nanovar/nanovar b/nanovar/nanovar
@@ -178,6 +178,7 @@ def main():
     filename = os.path.basename(file_path)
     read_suffix = ['.fa', '.fq', '.fasta', '.fastq', '.fa.gzip', '.fq.gzip', '.fa.gz', '.fq.gz', '.fasta.gz', '.fastq.gz']
     bam_suffix = '.bam'
+    cram_suffix = '.cram'
     contig_list = []
     if any(filename.lower().endswith(s) for s in read_suffix):
         input_name = os.path.basename(file_path).rsplit('.f', 1)[0]
@@ -210,9 +211,25 @@ def main():
         except AssertionError:
             logging.critical("Error: Input BAM file is not a BAM file.")
             raise Exception("Error: Input BAM file is not a BAM file.")
+    elif filename.lower().endswith(cram_suffix):
+        save = pysam.set_verbosity(0)  # Suppress index missing warning
+        sam = pysam.AlignmentFile(file_path, "rc", reference_filename=ref_path)
+        pysam.set_verbosity(save)  # Revert verbosity level
+        try:
+            assert sam.is_cram, "Error: Input CRAM file is not a CRAM file."
+            input_name = os.path.basename(file_path).rsplit('.cram', 1)[0]
+            input_type = 'cram'
+            fastx_check = []
+            # Get CRAM contigs from header
+            header = sam.header.to_dict()
+            for h in header['SQ']:
+                contig_list.append(h['SN'])
+        except AssertionError:
+            logging.critical("Error: Input CRAM file is not a CRAM file.")
+            raise Exception("Error: Input CRAM file is not a CRAM file.")
     else:
-        logging.critical("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam'")
-        raise Exception("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam'")
+        logging.critical("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam' or '.cram'")
+        raise Exception("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam' or '.cram'")
 
     if gzip_check(ref_path):
         logging.critical("Error: Input reference file is gzipped, please unzip it")
@@ -235,7 +252,7 @@ def main():
     logging.info('Number of threads: %s\n' % str(threads))
     if input_type == 'raw':
         logging.info('Total number of reads in FASTQ/FASTA: %s\n' % str(fastx_check[1]))
-    elif input_type == 'bam':
+    elif input_type == 'bam' or input_type == 'cram':
         logging.info('Total number of reads in FASTQ/FASTA: -\n')
     logging.info('NanoVar started')
 
@@ -248,22 +265,22 @@ def main():
         contig_len_dict[seq_record.id] = len(seq_record)
         total_gsize += len(seq_record)
 
-    # Check if contig sizes from BAM is same as hg38 if CNV mode
+    # Check if contig sizes from BAM/CRAM is same as hg38 if CNV mode
     if cnv:
         for chrom in hg38_size_dict:
             if chrom in contig_len_dict:
                 if contig_len_dict[chrom] != hg38_size_dict[chrom]:
-                    raise Exception("Error: Contig %s in BAM (%i) has different length in hg38 genome assembly (%i)" %
+                    raise Exception("Error: Contig %s in BAM/CRAM (%i) has different length in hg38 genome assembly (%i)" %
                                     (chrom, contig_len_dict[chrom], hg38_size_dict[chrom]))
             else:
-                raise Exception("Error: hg38 contig %s is absent in BAM" % chrom)
+                raise Exception("Error: hg38 contig %s is absent in BAM/CRAM" % chrom)
 
-    # Check BAM contigs in reference genome
-    if input_type == 'bam':
+    # Check BAM/CRAM contigs in reference genome
+    if input_type == 'bam' or input_type == 'cram':
         for c in contig_list:
             if c not in contig_len_dict:
-                logging.critical("Error: Contig %s in BAM is absent in reference genome" % c)
-                raise Exception("Error: Contig %s in BAM is absent in reference genome" % c)
+                logging.critical("Error: Contig %s in BAM/CRAM is absent in reference genome" % c)
+                raise Exception("Error: Contig %s in BAM/CRAM is absent in reference genome" % c)
 
     # Check contig id for invalid symbols
     contig_omit = checkcontignames(contig_len_dict)
@@ -326,9 +343,12 @@ def main():
         print(datetime.now().strftime("[%d/%m/%Y %H:%M:%S]"), '- Read mapping using minimap2 - ', end='', flush=True)
         mma = align_mm(ref_path, file_path, wk_dir, input_name, ref_name, threads_mm, mm, data_type, st)
         bam_path = mma[1]
+        save = pysam.set_verbosity(0)  # Suppress index missing warning
+        sam = pysam.AlignmentFile(bam_path, "rb")
+        pysam.set_verbosity(save)  # Revert verbosity level
         print('Done')
-    elif input_type == 'bam':
-        logging.info('Input BAM file, skipping minimap2 alignment')
+    elif input_type == 'bam' or input_type == 'cram':
+        logging.info('Input BAM/CRAM file, skipping minimap2 alignment')
         mma = ['-', '']
         bam_path = file_path
     # else:
@@ -354,7 +374,7 @@ def main():
     logging.info('Parsing BAM and detecting SVs')
     run = VariantDetect(wk_dir, bam_path, splitpct, minalign, filter_path, minlen, buff, model_path,
                         total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, mma[0],
-                        mincov, homo_t, het_t, debug, contig_omit, cnv, nv_cmd)
+                        mincov, homo_t, het_t, debug, contig_omit, cnv, nv_cmd, sam)
     run.bam_parse_detect()
     run.coverage_stats()
     print('Done')
@@ -441,6 +461,7 @@ def main():
         #               total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, mma[0],
         #               mincov, homo_t, het_t, debug, contig_omit, cnv, run.rlendict, run.total_subdata, run.total_out,
         #               run.out_nn, [])  #
+        run.sam = None # Remove pysam instance 
         with open(os.path.join(wk_dir, "nv_run.pkl"), "wb") as f:
             #pickle.dump(saveobjects, f)
             pickle.dump(run, f) #

diff --git a/nanovar/nv_alt_seq.py b/nanovar/nv_alt_seq.py
@@ -0,0 +1,85 @@
+"""
+Extract alternative sequences
+
+Copyright (C) 2024 Tham Cheng Yong
+
+This file is part of NanoVar.
+
+NanoVar is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+NanoVar is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with NanoVar.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+import os
+import logging
+from pybedtools import BedTool
+from Bio.Seq import Seq
+
+def get_alt_seq(wk_dir, out_nn, ref_path):
+    bed_str = ''
+    ins_id_seq = {}
+    ins_read_seq = get_ins_seq(wk_dir)
+    for line in out_nn:
+       sv_type = line.split('\t')[3].split(' ')[0]
+       sv_id = line.split('\t')[6].split('~')[0]
+       strand = line.split('\t')[3].split('~')[1].split(',')[0]
+       bed_line = make_bed(line, sv_type, sv_id)
+       bed_str += bed_line
+       if sv_type in ['Nov_Ins', 'E-Nov_Ins_bp', 'S-Nov_Ins_bp']:
+           ins_seq = ins_seq_except(ins_read_seq, line.split('\t')[8])
+           if strand == '+':
+               ins_id_seq[sv_id] = ins_seq
+           elif strand == '-':
+               ins_id_seq[sv_id] = str(Seq(ins_seq).reverse_complement())
+           else:
+               raise Exception('{} strand symbol invalid'.format(strand))
+    bed = BedTool(bed_str, from_string=True)
+    fasta = bed.sequence(fi=ref_path, nameOnly=True)
+    alt_seq = {}
+    with open(fasta.seqfn) as f:
+        for r in f:
+            alt_seq[r.strip('>\n')] = next(f).upper().strip()
+    for i in ins_id_seq:
+        alt_seq[i] = alt_seq[i] + ins_id_seq[i]
+    return alt_seq
+
+def make_bed(line, sv_type, sv_id):
+    data = line.split('\t')
+    chrm = data[6].split('~')[1].split(':')[0]
+    if sv_type in ['Inter-Ins(1)', 'Inter-Ins(2)', 'InterTx']:
+        end = int(data[6].split('~')[1].split(':')[1])
+        start = end - 1
+    elif sv_type == 'Del':
+        start = int(data[6].split('~')[1].split(':')[1].split('-')[0]) - 1
+        end = int(data[6].split('~')[1].split(':')[1].split('-')[1])
+    else:  # ['Nov_Ins', 'E-Nov_Ins_bp', 'S-Nov_Ins_bp', 'Inv', 'Inv(1)', 'Inv(2)', 'TDupl', 'Intra-Ins', 'Intra-Ins(1)', 'Intra-Ins(2)']
+        end = int(data[6].split('~')[1].split(':')[1].split('-')[0])
+        start = end - 1
+    start = max(start, 0)
+    end = max(end, 1)
+    bed_line = '\t'.join([chrm, str(start), str(end), sv_id]) + '\n'
+    return bed_line
+
+def get_ins_seq(wk_dir):
+    with open(os.path.join(wk_dir, 'ins_seq.fa')) as f:
+        ins_read_seq = {}
+        for r in f:
+            id = r.split('::')[0].strip('>')
+            ins_read_seq[id] = next(f).strip()
+        return ins_read_seq
+
+def ins_seq_except(ins_read_seq, read_id):
+    try:
+        return ins_read_seq[read_id]
+    except KeyError:  # Not found in ins_seq.fa, below score threshold
+        return 'N'
+