Skip to content

Commit

Permalink
Add alt seq and cram support (#87)
Browse files Browse the repository at this point in the history
* Modify SV ID

nv_SV?-????? to NV.DEL.SV?-?????

* Integrate SV support file generation

* Change lead read

Lead read selection now based on median SV size rather than largest SV size

* Removed ins_size_mod

* Create nv_alt_seq.py

* Add alt_seq

* Add ins_seq

* Add ins strandness

* Add alt_seq

* bump v1.7.0

* Fix wkdir error

* Fix alt seq var error

* Fix chr2 bug

* Fix ALT parsing

* Fix main sv bug

* Fix median size

* Fix key error

* Fix bed <0

* Fix bed <0

* Fix bed <0

* Fix bed <0

* Add cram

* Add refpath for cram

* Fix cram ref

* Fix pickle

* Update changelog
  • Loading branch information
cytham authored Jun 4, 2024
1 parent 75b8bf5 commit 8a7c5cb
Show file tree
Hide file tree
Showing 10 changed files with 630 additions and 618 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ NanoVar Changelog
Release Summary:


Version 1.7.0 - Jun 4, 2024
* Added alternative sequence for INS and DEL in VCF
* Added CRAM input support
* Modified SV ID format with addition of SV type, from nv_SV?-????? to NV.INS.SV?-?????
* SV size now takes the median size amongst clustered reads, instead of the largest


Version 1.6.2 - Apr 29, 2024
* Fixed bug where some supplementary alignments do not have primary alignments, in the case of a subsample BAM input

Expand Down
47 changes: 34 additions & 13 deletions nanovar/nanovar
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ def main():
filename = os.path.basename(file_path)
read_suffix = ['.fa', '.fq', '.fasta', '.fastq', '.fa.gzip', '.fq.gzip', '.fa.gz', '.fq.gz', '.fasta.gz', '.fastq.gz']
bam_suffix = '.bam'
cram_suffix = '.cram'
contig_list = []
if any(filename.lower().endswith(s) for s in read_suffix):
input_name = os.path.basename(file_path).rsplit('.f', 1)[0]
Expand Down Expand Up @@ -210,9 +211,25 @@ def main():
except AssertionError:
logging.critical("Error: Input BAM file is not a BAM file.")
raise Exception("Error: Input BAM file is not a BAM file.")
elif filename.lower().endswith(cram_suffix):
save = pysam.set_verbosity(0) # Suppress index missing warning
sam = pysam.AlignmentFile(file_path, "rc", reference_filename=ref_path)
pysam.set_verbosity(save) # Revert verbosity level
try:
assert sam.is_cram, "Error: Input CRAM file is not a CRAM file."
input_name = os.path.basename(file_path).rsplit('.cram', 1)[0]
input_type = 'cram'
fastx_check = []
# Get CRAM contigs from header
header = sam.header.to_dict()
for h in header['SQ']:
contig_list.append(h['SN'])
except AssertionError:
logging.critical("Error: Input CRAM file is not a CRAM file.")
raise Exception("Error: Input CRAM file is not a CRAM file.")
else:
logging.critical("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam'")
raise Exception("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam'")
logging.critical("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam' or '.cram'")
raise Exception("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam' or '.cram'")

if gzip_check(ref_path):
logging.critical("Error: Input reference file is gzipped, please unzip it")
Expand All @@ -235,7 +252,7 @@ def main():
logging.info('Number of threads: %s\n' % str(threads))
if input_type == 'raw':
logging.info('Total number of reads in FASTQ/FASTA: %s\n' % str(fastx_check[1]))
elif input_type == 'bam':
elif input_type == 'bam' or input_type == 'cram':
logging.info('Total number of reads in FASTQ/FASTA: -\n')
logging.info('NanoVar started')

Expand All @@ -248,22 +265,22 @@ def main():
contig_len_dict[seq_record.id] = len(seq_record)
total_gsize += len(seq_record)

# Check if contig sizes from BAM is same as hg38 if CNV mode
# Check if contig sizes from BAM/CRAM is same as hg38 if CNV mode
if cnv:
for chrom in hg38_size_dict:
if chrom in contig_len_dict:
if contig_len_dict[chrom] != hg38_size_dict[chrom]:
raise Exception("Error: Contig %s in BAM (%i) has different length in hg38 genome assembly (%i)" %
raise Exception("Error: Contig %s in BAM/CRAM (%i) has different length in hg38 genome assembly (%i)" %
(chrom, contig_len_dict[chrom], hg38_size_dict[chrom]))
else:
raise Exception("Error: hg38 contig %s is absent in BAM" % chrom)
raise Exception("Error: hg38 contig %s is absent in BAM/CRAM" % chrom)

# Check BAM contigs in reference genome
if input_type == 'bam':
# Check BAM/CRAM contigs in reference genome
if input_type == 'bam' or input_type == 'cram':
for c in contig_list:
if c not in contig_len_dict:
logging.critical("Error: Contig %s in BAM is absent in reference genome" % c)
raise Exception("Error: Contig %s in BAM is absent in reference genome" % c)
logging.critical("Error: Contig %s in BAM/CRAM is absent in reference genome" % c)
raise Exception("Error: Contig %s in BAM/CRAM is absent in reference genome" % c)

# Check contig id for invalid symbols
contig_omit = checkcontignames(contig_len_dict)
Expand Down Expand Up @@ -326,9 +343,12 @@ def main():
print(datetime.now().strftime("[%d/%m/%Y %H:%M:%S]"), '- Read mapping using minimap2 - ', end='', flush=True)
mma = align_mm(ref_path, file_path, wk_dir, input_name, ref_name, threads_mm, mm, data_type, st)
bam_path = mma[1]
save = pysam.set_verbosity(0) # Suppress index missing warning
sam = pysam.AlignmentFile(bam_path, "rb")
pysam.set_verbosity(save) # Revert verbosity level
print('Done')
elif input_type == 'bam':
logging.info('Input BAM file, skipping minimap2 alignment')
elif input_type == 'bam' or input_type == 'cram':
logging.info('Input BAM/CRAM file, skipping minimap2 alignment')
mma = ['-', '']
bam_path = file_path
# else:
Expand All @@ -354,7 +374,7 @@ def main():
logging.info('Parsing BAM and detecting SVs')
run = VariantDetect(wk_dir, bam_path, splitpct, minalign, filter_path, minlen, buff, model_path,
total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, mma[0],
mincov, homo_t, het_t, debug, contig_omit, cnv, nv_cmd)
mincov, homo_t, het_t, debug, contig_omit, cnv, nv_cmd, sam)
run.bam_parse_detect()
run.coverage_stats()
print('Done')
Expand Down Expand Up @@ -441,6 +461,7 @@ def main():
# total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, mma[0],
# mincov, homo_t, het_t, debug, contig_omit, cnv, run.rlendict, run.total_subdata, run.total_out,
# run.out_nn, []) #
run.sam = None # Remove pysam instance
with open(os.path.join(wk_dir, "nv_run.pkl"), "wb") as f:
#pickle.dump(saveobjects, f)
pickle.dump(run, f) #
Expand Down
85 changes: 85 additions & 0 deletions nanovar/nv_alt_seq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""
Extract alternative sequences
Copyright (C) 2024 Tham Cheng Yong
This file is part of NanoVar.
NanoVar is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
NanoVar is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with NanoVar. If not, see <https://www.gnu.org/licenses/>.
"""

import os
import logging
from pybedtools import BedTool
from Bio.Seq import Seq

def get_alt_seq(wk_dir, out_nn, ref_path):
bed_str = ''
ins_id_seq = {}
ins_read_seq = get_ins_seq(wk_dir)
for line in out_nn:
sv_type = line.split('\t')[3].split(' ')[0]
sv_id = line.split('\t')[6].split('~')[0]
strand = line.split('\t')[3].split('~')[1].split(',')[0]
bed_line = make_bed(line, sv_type, sv_id)
bed_str += bed_line
if sv_type in ['Nov_Ins', 'E-Nov_Ins_bp', 'S-Nov_Ins_bp']:
ins_seq = ins_seq_except(ins_read_seq, line.split('\t')[8])
if strand == '+':
ins_id_seq[sv_id] = ins_seq
elif strand == '-':
ins_id_seq[sv_id] = str(Seq(ins_seq).reverse_complement())
else:
raise Exception('{} strand symbol invalid'.format(strand))
bed = BedTool(bed_str, from_string=True)
fasta = bed.sequence(fi=ref_path, nameOnly=True)
alt_seq = {}
with open(fasta.seqfn) as f:
for r in f:
alt_seq[r.strip('>\n')] = next(f).upper().strip()
for i in ins_id_seq:
alt_seq[i] = alt_seq[i] + ins_id_seq[i]
return alt_seq

def make_bed(line, sv_type, sv_id):
data = line.split('\t')
chrm = data[6].split('~')[1].split(':')[0]
if sv_type in ['Inter-Ins(1)', 'Inter-Ins(2)', 'InterTx']:
end = int(data[6].split('~')[1].split(':')[1])
start = end - 1
elif sv_type == 'Del':
start = int(data[6].split('~')[1].split(':')[1].split('-')[0]) - 1
end = int(data[6].split('~')[1].split(':')[1].split('-')[1])
else: # ['Nov_Ins', 'E-Nov_Ins_bp', 'S-Nov_Ins_bp', 'Inv', 'Inv(1)', 'Inv(2)', 'TDupl', 'Intra-Ins', 'Intra-Ins(1)', 'Intra-Ins(2)']
end = int(data[6].split('~')[1].split(':')[1].split('-')[0])
start = end - 1
start = max(start, 0)
end = max(end, 1)
bed_line = '\t'.join([chrm, str(start), str(end), sv_id]) + '\n'
return bed_line

def get_ins_seq(wk_dir):
with open(os.path.join(wk_dir, 'ins_seq.fa')) as f:
ins_read_seq = {}
for r in f:
id = r.split('::')[0].strip('>')
ins_read_seq[id] = next(f).strip()
return ins_read_seq

def ins_seq_except(ins_read_seq, read_id):
try:
return ins_read_seq[read_id]
except KeyError: # Not found in ins_seq.fa, below score threshold
return 'N'

Loading

0 comments on commit 8a7c5cb

Please sign in to comment.