Merge pull request #98 from cytham/v1.8.2-dev

v1.8.2
cytham · Jan 6, 2025 · e17398b
2 parents d8a746b + 3edf8d4
commit e17398b
Show file tree

Hide file tree

Showing 7 changed files with 156 additions and 71 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -4,6 +4,10 @@ NanoVar Changelog
 Release Summary:
 
 
+Version 1.8.2 - Jan 6, 2025
+    * Added '--sv_bam_out' option to output SV-supporting reads in BAM format, with their SV-IDs stored in the 'nv' tag
+
+
 Version 1.8.1 - Sep 29, 2024
     * Patch to restrict NumPy version to <2.0.0 for TF compatibility
 

diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-## Please note: Current v1.8.1 not compatible with Tensorflow >= 2.16.0, please downgrade to 2.15.1
+## Please note: The current v1.8.2 is not compatible with TensorFlow >= 2.16.0; please downgrade to 2.15.1
 
 `pip install tensorflow-cpu==2.15.1`
 
@@ -46,7 +46,7 @@ nanovar [Options] -t 24 -f hg38 sample.fq/sample.bam ref.fa working_dir
 | :--- | :--- | :--- |
 | `-t` | num_threads | Indicate number of CPU threads to use |
 | `-f` (Optional) | gap_file (Optional) | Choose built-in gap BED file or specify own file to exclude gap regions in the reference genome. Built-in gap files include: hg19, hg38 and mm10 |
-| - | sample.fq/sample.bam | Input long-read FASTA/FASTQ file or mapped BAM file |
+| - | sample.fq/sample.bam/sample.cram | Input long-read FASTA/FASTQ file or mapped BAM/CRAM file |
 | - | ref.fa | Input reference genome in FASTA format |
 | - | working_dir | Specify working directory |
 
@@ -62,25 +62,23 @@ For more information, see [wiki](https://github.com/cytham/nanovar/wiki).
 
 ### Full usage
 ```
-usage: nanovar [options] [FASTQ/FASTA/BAM] [REFERENCE_GENOME] [WORK_DIRECTORY]
+usage: nanovar [options] [FASTQ/FASTA/BAM/CRAM] [REFERENCE_GENOME] [WORK_DIRECTORY]
 
-NanoVar is a neural network enhanced structural variant (SV) caller that handles low-depth long-read sequencing data.
+NanoVar is a long-read structural variant (SV) caller.
 
 positional arguments:
-  [FASTQ/FASTA/BAM]     path to long reads or mapped BAM file.
-                        Formats: fasta/fa/fa.gzip/fa.gz/fastq/fq/fq.gzip/fq.gz or .bam
-  [reference_genome]    path to reference genome in FASTA. Genome indexes created
+  [FASTQ/FASTA/BAM/CRAM]
+                        Path to long reads or mapped BAM/CRAM file.
+                        Formats: fasta/fa/fa.gzip/fa.gz/fastq/fq/fq.gzip/fq.gz/bam/cram
+  [reference_genome]    Path to reference genome in FASTA. Genome indexes created
                         will overwrite indexes created by other aligners such as bwa.
-  [work_directory]      path to work directory. Directory will be created
+  [work_directory]      Path to work directory. Directory will be created
                         if it does not exist.
 
 options:
   -h, --help            show this help message and exit
-  --cnv hg38            also detects large genomic copy-number variations
-                        using CytoCAD (e.g. loss/gain of whole chromosomes).
-                        Only works with hg38 genome assembly. Please state 'hg38' [None]
   -x str, --data_type str
-                        type of long-read data [ont]
+                        Type of long-read data [ont]
                         ont - Oxford Nanopore Technologies
                         pacbio-clr - Pacific Biosciences CLR
                         pacbio-ccs - Pacific Biosciences CCS
@@ -89,35 +87,37 @@ options:
                         (e.g. telomeres and centromeres) Either specify name of in-built
                         reference genome filter (i.e. hg38, hg19, mm10) or provide full
                         path to own BED file.
-  --annotate_ins str    enable annotation of INS with NanoINSight,
+  --annotate_ins str    Enable annotation of INS with NanoINSight,
                         please specify species of sample [None]
                         Currently supported species are:
                         'human', 'mouse', and 'rattus'.
-  -c int, --mincov int  minimum number of reads required to call a breakend [4]
-  -l int, --minlen int  minimum length of SV to be detected [25]
+  -c int, --mincov int  Minimum number of reads required to call a breakend [4]
+  -l int, --minlen int  Minimum length of SV to be detected [25]
   -p float, --splitpct float
-                        minimum percentage of unmapped bases within a long read
+                        Minimum percentage of unmapped bases within a long read
                         to be considered as a split-read. 0.05<=p<=0.50 [0.05]
   -a int, --minalign int
-                        minimum alignment length for single alignment reads [200]
-  -b int, --buffer int  nucleotide length buffer for SV breakend clustering [50]
+                        Minimum alignment length for single alignment reads [200]
+  -b int, --buffer int  Nucleotide length buffer for SV breakend clustering [50]
   -s float, --score float
-                        score threshold for defining PASS/FAIL SVs in VCF [1.0]
+                        Score threshold for defining PASS/FAIL SVs in VCF [1.0]
                         Default score 1.0 was estimated from simulated analysis.
-  --homo float          lower limit of a breakend read ratio to classify a homozygous state [0.75]
+  --homo float          Lower limit of a breakend read ratio to classify a homozygous state [0.75]
                         (i.e. Any breakend with homo<=ratio<=1.00 is classified as homozygous)
-  --hetero float        lower limit of a breakend read ratio to classify a heterozygous state [0.35]
+  --hetero float        Lower limit of a breakend read ratio to classify a heterozygous state [0.35]
                         (i.e. Any breakend with hetero<=ratio<homo is classified as heterozygous)
-  --debug               run in debug mode
-  -v, --version         show version and exit
-  -q, --quiet           hide verbose
+  --sv_bam_out          Outputs a BAM file containing only SV-supporting reads with
+                        their corresponding SV-ID(s) stored in the "nv" tag separated by comma.
+  --debug               Run in debug mode
+  -v, --version         Show version and exit
+  -q, --quiet           Hide verbose
   -t int, --threads int
-                        number of available threads for use [1]
-  --model path          specify path to custom-built model
-  --mm path             specify path to 'minimap2' executable
-  --st path             specify path to 'samtools' executable
-  --ma path             specify path to 'mafft' executable for NanoINSight
-  --rm path             specify path to 'RepeatMasker' executable for NanoINSight
+                        Number of available threads for use [1]
+  --model path          Specify path to custom-built model
+  --mm path             Specify path to 'minimap2' executable
+  --st path             Specify path to 'samtools' executable
+  --ma path             Specify path to 'mafft' executable for NanoINSight
+  --rm path             Specify path to 'RepeatMasker' executable for NanoINSight
 ```
 
 ### Operating system

diff --git a/src/nanovar/nanovar.py b/src/nanovar/nanovar.py
@@ -381,6 +381,16 @@ def main():
         con_fasta, threads_per_job = nanoinsight.create_cons(vcf, wk_dir, fasta_dir, id_seq, threads, mafft_exe, batch_size=100, num_parallel_workers=5)
         nanoinsight.rep_annote(wk_dir, con_fasta, threads_per_job, species, repmask_exe)
         print('Done')
+
+    # Output SV-supporting BAM
+    if args.sv_bam_out:
+        print(datetime.now().strftime("[%d/%m/%Y %H:%M:%S]"), '- Creating SV-supporting BAM - ', end='', flush=True)
+        logging.info('Creating SV-supporting BAM')
+        from .nv_supp_bam import create_sv_supp_bam
+        vcf = os.path.join(wk_dir, '%s.nanovar.pass.vcf' % input_name)
+        sv_sup = os.path.join(wk_dir, 'sv_support_reads.tsv')
+        create_sv_supp_bam(vcf, sv_sup, bam_path, wk_dir, input_type, ref_path)
+        print('Done')
 
     # Delete temporary fasta file
     if not archivefasta:

diff --git a/src/nanovar/nv_input.py b/src/nanovar/nv_input.py
@@ -27,8 +27,7 @@
 
 # Parse input
 def input_parser(args=sys.argv[1:]):
-    parser = argparse.ArgumentParser(description="NanoVar is a neural network enhanced structural variant (SV) caller that \
-handles low-depth long-read sequencing data.",
+    parser = argparse.ArgumentParser(description="NanoVar is a long-read structural variant (SV) caller.",
                                      formatter_class=argparse.RawTextHelpFormatter, usage=msg())  # RawDescriptionHelpFormatter)
 
     def restrict_float(f):
@@ -38,29 +37,28 @@ def restrict_float(f):
         return f
 
     parser.add_argument("input", type=str,
-                        metavar="[FASTQ/FASTA/BAM]",
-                        help="""path to long reads or mapped BAM file.
-Formats: fasta/fa/fa.gzip/fa.gz/fastq/fq/fq.gzip/fq.gz or .bam""")
+                        metavar="[FASTQ/FASTA/BAM/CRAM]",
+                        help="""Path to long reads or mapped BAM/CRAM file.
+Formats: fasta/fa/fa.gzip/fa.gz/fastq/fq/fq.gzip/fq.gz/bam/cram""")
 
     parser.add_argument("ref", type=str,
                         metavar="[reference_genome]",
-                        help="""path to reference genome in FASTA. Genome indexes created 
+                        help="""Path to reference genome in FASTA. Genome indexes created 
 will overwrite indexes created by other aligners such as bwa.""")
 
     parser.add_argument("dir", type=str,
                         metavar="[work_directory]",
-                        help="""path to work directory. Directory will be created 
+                        help="""Path to work directory. Directory will be created 
 if it does not exist.""")
 
     parser.add_argument("--cnv", type=str, metavar="hg38",
                         default=None,
-                        help="""also detects large genomic copy-number variations 
-using CytoCAD (e.g. loss/gain of whole chromosomes). 
-Only works with hg38 genome assembly. Please state 'hg38' [None]""")
+                        help=argparse.SUPPRESS)
+    # help="""Detects large genomic copy-number variations using CytoCAD (e.g. loss/gain of whole chromosomes). Only works with hg38 genome assembly. Please state 'hg38' [None]"""
 
     parser.add_argument("-x", "--data_type", type=str, metavar="str",
                         default='ont',
-                        help="""type of long-read data [ont]
+                        help="""Type of long-read data [ont]
 ont - Oxford Nanopore Technologies
 pacbio-clr - Pacific Biosciences CLR
 pacbio-ccs - Pacific Biosciences CCS""")
@@ -73,88 +71,92 @@ def restrict_float(f):
 
     parser.add_argument("--annotate_ins", type=str, metavar="str",
                         default=None,
-                        help="""enable annotation of INS with NanoINSight, 
+                        help="""Enable annotation of INS with NanoINSight, 
 please specify species of sample [None]
 Currently supported species are:
 'human', 'mouse', and 'rattus'.
 """)
 
     parser.add_argument("-c", "--mincov", type=int, metavar="int",
                         default=4,
-                        help="minimum number of reads required to call a breakend [4]")
+                        help="Minimum number of reads required to call a breakend [4]")
 
     parser.add_argument("-l", "--minlen", type=int, metavar="int",
                         default=25,
-                        help="minimum length of SV to be detected [25]")
+                        help="Minimum length of SV to be detected [25]")
 
     parser.add_argument("-p", "--splitpct", type=restrict_float, metavar="float",
                         default=0.05,
-                        help="""minimum percentage of unmapped bases within a long read 
+                        help="""Minimum percentage of unmapped bases within a long read 
 to be considered as a split-read. 0.05<=p<=0.50 [0.05]""")
 
     parser.add_argument("-a", "--minalign", type=int, metavar="int",
                         default=200,
-                        help="minimum alignment length for single alignment reads [200]")
+                        help="Minimum alignment length for single alignment reads [200]")
 
     parser.add_argument("-b", "--buffer", type=int, metavar="int",
                         default=50,
-                        help="nucleotide length buffer for SV breakend clustering [50]")
+                        help="Nucleotide length buffer for SV breakend clustering [50]")
 
     parser.add_argument("-s", "--score", type=float, metavar="float",
                         default=1.0,
-                        help="""score threshold for defining PASS/FAIL SVs in VCF [1.0]
+                        help="""Score threshold for defining PASS/FAIL SVs in VCF [1.0]
 Default score 1.0 was estimated from simulated analysis. """)
 
     parser.add_argument("--homo", type=float, metavar="float",
                         default=0.75,
-                        help="""lower limit of a breakend read ratio to classify a homozygous state [0.75]
+                        help="""Lower limit of a breakend read ratio to classify a homozygous state [0.75]
 (i.e. Any breakend with homo<=ratio<=1.00 is classified as homozygous)""")
 
     parser.add_argument("--hetero", type=float, metavar="float",
                         default=0.35,
-                        help="""lower limit of a breakend read ratio to classify a heterozygous state [0.35]
+                        help="""Lower limit of a breakend read ratio to classify a heterozygous state [0.35]
 (i.e. Any breakend with hetero<=ratio<homo is classified as heterozygous)""")
 
+    parser.add_argument("--sv_bam_out", action='store_true',
+                        help="""Outputs a BAM file containing only SV-supporting reads with 
+their corresponding SV-ID(s) stored in the "nv" tag separated by comma.""")
+
     parser.add_argument("--debug", action='store_true',
-                        help="run in debug mode")
+                        help="Run in debug mode")
 
     # parser.add_argument("--force", action='store_true',
-    #                     help="run full pipeline (i.e. do not skip index generation)")
+    #                     help="Run full pipeline (i.e. do not skip index generation)")
 
     parser.add_argument("-v", "--version", action='version',
                         version=__version__,
-                        help="show version and exit")
+                        help="Show version and exit")
 
     parser.add_argument("-q", "--quiet", action='store_true',
-                        help="hide verbose")
+                        help="Hide verbose")
 
     parser.add_argument("-t", "--threads", type=int, metavar="int",
                         default=1,
-                        help="number of available threads for use [1]")
+                        help="Number of available threads for use [1]")
 
     parser.add_argument("--model", type=str, metavar="path",
-                        help="specify path to custom-built model")
+                        help="Specify path to custom-built model")
 
     parser.add_argument("--mm", type=str, metavar="path",
-                        help="specify path to 'minimap2' executable")
+                        help="Specify path to 'minimap2' executable")
 
     parser.add_argument("--st", type=str, metavar="path",
-                        help="specify path to 'samtools' executable")
+                        help="Specify path to 'samtools' executable")
 
     parser.add_argument("--ma", type=str, metavar="path",
-                        help="specify path to 'mafft' executable for NanoINSight")
+                        help="Specify path to 'mafft' executable for NanoINSight")
 
     parser.add_argument("--rm", type=str, metavar="path",
-                        help="specify path to 'RepeatMasker' executable for NanoINSight")
+                        help="Specify path to 'RepeatMasker' executable for NanoINSight")
 
     # parser.add_argument("--mdb", type=str, metavar="path",
-    #                     help="specify path to 'makeblastdb' executable")
+    #                     help="Specify path to 'makeblastdb' executable")
 
     # parser.add_argument("--wmk", type=str, metavar="path",
-    #                     help="specify path to 'windowmasker' executable")
+    #                     help="Specify path to 'windowmasker' executable")
 
     # parser.add_argument("--hsb", type=str, metavar="path",
-    #                     help="specify path to 'hs-blastn' executable")
+    #                     help="Specify path to 'hs-blastn' executable")
 
     parser.add_argument("--pickle", action='store_true',
                         help=argparse.SUPPRESS)
@@ -178,4 +180,4 @@ def gzip_check(path):
 
 # Custom usage message
 def msg():
-    return "nanovar [options] [FASTQ/FASTA/BAM] [REFERENCE_GENOME] [WORK_DIRECTORY]"
+    return "nanovar [options] [FASTQ/FASTA/BAM/CRAM] [REFERENCE_GENOME] [WORK_DIRECTORY]"
diff --git a/src/nanovar/nv_report.py b/src/nanovar/nv_report.py
@@ -512,8 +512,8 @@ def create_html(data, fwd, wk_dir, vcf_path, timenow, read_name, read_path, ref_
             <br>
             <br>
             <figure>
-                <h4 style="text-align:center;"><u>5. Scatter plot between SV confidence score and read depth</u></h4>
-                <img src=""" + '"' + fwd + """/scatter2.png" alt="5. Scatter plot between SV confidence score and read depth" 
+                <h4 style="text-align:center;"><u>5. Scatter plot between SV confidence score and supporting read depth</u></h4>
+                <img src=""" + '"' + fwd + """/scatter2.png" alt="5. Scatter plot between SV confidence score and supporting read depth" 
                 title=""" + '"' + fwd + '/scatter2.png' + '"' + """>
             </figure>
             <br>
@@ -602,11 +602,11 @@ def create_html(data, fwd, wk_dir, vcf_path, timenow, read_name, read_path, ref_
             mtype = mtype + ';base64'
             en_data = base64.b64encode(data).decode()
             tag['src'] = "data:{},{}".format(mtype, en_data)
-    packed_html = str(soup)
+    # packed_html = str(soup)
     with open(os.path.join(wk_dir, f"{read_name}.nanovar.pass.report.html"), "w", encoding = 'utf-8') as file: 
         file.write(str(soup.prettify()))
     # packed_html = htmlark.convert_page(os.path.join(wk_dir, '%s.nanovar.pass.report-tmp.html' % read_name), ignore_errors=True)
-    html_final = open(os.path.join(wk_dir, '%s.nanovar.pass.report.html' % read_name), 'w')
-    _ = html_final.write(packed_html)
-    html_final.close()
+    # html_final = open(os.path.join(wk_dir, '%s.nanovar.pass.report.html' % read_name), 'w')
+    # _ = html_final.write(packed_html)
+    # html_final.close()
     os.remove(os.path.join(wk_dir, '%s.nanovar.pass.report-tmp.html' % read_name))