diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 052a8c8..55e163f 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -4,6 +4,10 @@ NanoVar Changelog Release Summary: +Version 1.8.2 - Jan 6, 2025 + * Added '--sv_bam_out' option to output SV-supporting reads in BAM format, with SV-IDs labeled on the 'nv' tag + + Version 1.8.1 - Sep 29, 2024 * Patch to restrict NumPy version to <2.0.0 for TF compatibility diff --git a/README.md b/README.md index 2406750..d1233dc 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -## Please note: Current v1.8.1 not compatible with Tensorflow >= 2.16.0, please downgrade to 2.15.1 +## Please note: Current v1.8.2 not compatible with Tensorflow >= 2.16.0, please downgrade to 2.15.1 `pip install tensorflow-cpu==2.15.1` @@ -46,7 +46,7 @@ nanovar [Options] -t 24 -f hg38 sample.fq/sample.bam ref.fa working_dir | :--- | :--- | :--- | | `-t` | num_threads | Indicate number of CPU threads to use | | `-f` (Optional) | gap_file (Optional) | Choose built-in gap BED file or specify own file to exclude gap regions in the reference genome. Built-in gap files include: hg19, hg38 and mm10 | -| - | sample.fq/sample.bam | Input long-read FASTA/FASTQ file or mapped BAM file | +| - | sample.fq/sample.bam/sample.cram | Input long-read FASTA/FASTQ file or mapped BAM/CRAM file | | - | ref.fa | Input reference genome in FASTA format | | - | working_dir | Specify working directory | @@ -62,25 +62,23 @@ For more information, see [wiki](https://github.com/cytham/nanovar/wiki). ### Full usage ``` -usage: nanovar [options] [FASTQ/FASTA/BAM] [REFERENCE_GENOME] [WORK_DIRECTORY] +usage: nanovar [options] [FASTQ/FASTA/BAM/CRAM] [REFERENCE_GENOME] [WORK_DIRECTORY] -NanoVar is a neural network enhanced structural variant (SV) caller that handles low-depth long-read sequencing data. +NanoVar is a long-read structural variant (SV) caller. positional arguments: - [FASTQ/FASTA/BAM] path to long reads or mapped BAM file. - Formats: fasta/fa/fa.gzip/fa.gz/fastq/fq/fq.gzip/fq.gz or .bam - [reference_genome] path to reference genome in FASTA. Genome indexes created + [FASTQ/FASTA/BAM/CRAM] + Path to long reads or mapped BAM/CRAM file. + Formats: fasta/fa/fa.gzip/fa.gz/fastq/fq/fq.gzip/fq.gz/bam/cram + [reference_genome] Path to reference genome in FASTA. Genome indexes created will overwrite indexes created by other aligners such as bwa. - [work_directory] path to work directory. Directory will be created + [work_directory] Path to work directory. Directory will be created if it does not exist. options: -h, --help show this help message and exit - --cnv hg38 also detects large genomic copy-number variations - using CytoCAD (e.g. loss/gain of whole chromosomes). - Only works with hg38 genome assembly. Please state 'hg38' [None] -x str, --data_type str - type of long-read data [ont] + Type of long-read data [ont] ont - Oxford Nanopore Technologies pacbio-clr - Pacific Biosciences CLR pacbio-ccs - Pacific Biosciences CCS @@ -89,35 +87,37 @@ options: (e.g. telomeres and centromeres) Either specify name of in-built reference genome filter (i.e. hg38, hg19, mm10) or provide full path to own BED file. - --annotate_ins str enable annotation of INS with NanoINSight, + --annotate_ins str Enable annotation of INS with NanoINSight, please specify species of sample [None] Currently supported species are: 'human', 'mouse', and 'rattus'. - -c int, --mincov int minimum number of reads required to call a breakend [4] - -l int, --minlen int minimum length of SV to be detected [25] + -c int, --mincov int Minimum number of reads required to call a breakend [4] + -l int, --minlen int Minimum length of SV to be detected [25] -p float, --splitpct float - minimum percentage of unmapped bases within a long read + Minimum percentage of unmapped bases within a long read to be considered as a split-read. 0.05<=p<=0.50 [0.05] -a int, --minalign int - minimum alignment length for single alignment reads [200] - -b int, --buffer int nucleotide length buffer for SV breakend clustering [50] + Minimum alignment length for single alignment reads [200] + -b int, --buffer int Nucleotide length buffer for SV breakend clustering [50] -s float, --score float - score threshold for defining PASS/FAIL SVs in VCF [1.0] + Score threshold for defining PASS/FAIL SVs in VCF [1.0] Default score 1.0 was estimated from simulated analysis. - --homo float lower limit of a breakend read ratio to classify a homozygous state [0.75] + --homo float Lower limit of a breakend read ratio to classify a homozygous state [0.75] (i.e. Any breakend with homo<=ratio<=1.00 is classified as homozygous) - --hetero float lower limit of a breakend read ratio to classify a heterozygous state [0.35] + --hetero float Lower limit of a breakend read ratio to classify a heterozygous state [0.35] (i.e. Any breakend with hetero<=ratio
-

5. Scatter plot between SV confidence score and read depth

- 5. Scatter plot between SV confidence score and read depth5. Scatter plot between SV confidence score and supporting read depth + 5. Scatter plot between SV confidence score and supporting read depth

@@ -602,11 +602,11 @@ def create_html(data, fwd, wk_dir, vcf_path, timenow, read_name, read_path, ref_ mtype = mtype + ';base64' en_data = base64.b64encode(data).decode() tag['src'] = "data:{},{}".format(mtype, en_data) - packed_html = str(soup) + # packed_html = str(soup) with open(os.path.join(wk_dir, f"{read_name}.nanovar.pass.report.html"), "w", encoding = 'utf-8') as file: file.write(str(soup.prettify())) # packed_html = htmlark.convert_page(os.path.join(wk_dir, '%s.nanovar.pass.report-tmp.html' % read_name), ignore_errors=True) - html_final = open(os.path.join(wk_dir, '%s.nanovar.pass.report.html' % read_name), 'w') - _ = html_final.write(packed_html) - html_final.close() + # html_final = open(os.path.join(wk_dir, '%s.nanovar.pass.report.html' % read_name), 'w') + # _ = html_final.write(packed_html) + # html_final.close() os.remove(os.path.join(wk_dir, '%s.nanovar.pass.report-tmp.html' % read_name)) diff --git a/src/nanovar/nv_supp_bam.py b/src/nanovar/nv_supp_bam.py new file mode 100644 index 0000000..e9ac4cb --- /dev/null +++ b/src/nanovar/nv_supp_bam.py @@ -0,0 +1,69 @@ +""" +Generate SV-supporting reads BAM file. + +@author: asmaa, cytham + +# Test +wk_dir = '.' +vcf_file = 'sample.nanovar.pass.vcf' +sv_supp_tsv_file = 'sv_support_reads.tsv' +in_bam_path = 'input.bam' +input_type = 'bam' +ref_path = 'ref.fa' +vcf = os.path.join(wk_dir, vcf_file) +sv_supp_tsv = os.path.join(wk_dir, sv_supp_tsv_file) +create_sv_supp_bam(vcf, sv_supp_tsv, in_bam_path, wk_dir, input_type, ref_path) + +""" + +import os +import pandas as pd +import pysam + +def create_sv_supp_bam(vcf, sv_supp_tsv, in_bam_path, wk_dir, input_type, ref_path): + pass_ids = parse_pass_sv(vcf) + supp_dict = parse_supp_tsv(sv_supp_tsv, pass_ids) + in_bam = input_type_func(in_bam_path, input_type, ref_path) + out_bam = os.path.join(wk_dir, "sv_support_reads.bam") + with pysam.AlignmentFile(out_bam, "wb", template=in_bam) as bam_out: + for seg in in_bam: + try: + id = supp_dict[seg.query_name] # If segment is SV-supporting read + seg.set_tag('nv', ','.join(id), value_type='Z', replace=True) # Set SV-ID tag + bam_out.write(seg) + except KeyError: + pass + bam_out.close() + +def parse_supp_tsv(sv_sup_tsv, pass_ids): + supp_df = pd.read_csv(sv_sup_tsv, sep='\t', header=0) + supp_df.rename(columns = {supp_df.columns[1]: 'readID'}, inplace = True) + supp_dict = {} + for index, row in supp_df.iterrows(): + for _id in row['readID'].split(','): + try: + _ = pass_ids[row['SV-ID']] # If passed SV-ID + id = _id.split('~')[0] + try: + supp_dict[id].add(row['SV-ID']) + except KeyError: + supp_dict[id] = set() + supp_dict[id].add(row['SV-ID']) + except KeyError: + pass + return supp_dict + +def parse_pass_sv(vcf): + pass_ids = {} + with open(vcf) as v: + for line in v: + if not line.startswith('#'): + pass_ids[line.split('\t')[2]] = 0 + return pass_ids + +def input_type_func(file_path, input_type, ref_path): + if input_type in ['bam', 'raw']: + sam = pysam.AlignmentFile(file_path, "rb") + elif input_type == 'cram': + sam = pysam.AlignmentFile(file_path, "rc", reference_filename=ref_path) + return sam diff --git a/src/nanovar/version.py b/src/nanovar/version.py index 2d986fc..320141d 100644 --- a/src/nanovar/version.py +++ b/src/nanovar/version.py @@ -1 +1 @@ -__version__ = "1.8.1" +__version__ = "1.8.2"