diff --git a/CHANGELOG.md b/CHANGELOG.md index ea1580e..865419e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ All notable changes to MoGAAAP will be documented in this file. ### Added - Add wrapper script to initialise, configure and run MoGAAAP (#80). - Add option to use the pipeline for already existing assemblies (#83). +- Add flye as an alternative assembler (#93). ### Changed - Update sample sheet to allow using the pipeline for already existing assemblies (#83). diff --git a/config/example.yaml b/config/example.yaml index aa15984..e309c58 100644 --- a/config/example.yaml +++ b/config/example.yaml @@ -10,7 +10,7 @@ samples: "config/example.tsv" # Assembly settings -assembler: "hifiasm" #supported assemblers: hifiasm, verkko +assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye # Scaffolding settings scaffolder: "ntjoin" #supported scaffolders: ntjoin, ragtag diff --git a/config/examples/arabidopsis.yaml b/config/examples/arabidopsis.yaml index 7be8230..79c3df6 100644 --- a/config/examples/arabidopsis.yaml +++ b/config/examples/arabidopsis.yaml @@ -10,7 +10,7 @@ samples: "config/samples.tsv" # Assembly settings -assembler: "hifiasm" #supported assemblers: hifiasm, verkko +assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye # All filtering and scaffolding parameters min_contig_len: 10000 @@ -53,12 +53,12 @@ helixer_model: "land_plant" #choose between land_plant, vertebrate, invertebrate helixer_max_gene_length: 64152 # Quality assessment settings -k_qa: 21 #optimal k-mer size for quality assessment -gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx -pantools_grouping: 3 #grouping relaxation setting for pantools -odb: "brassicales_odb10" #ODB database for busco -kraken2_nt: "/path/to/kraken2/nt" #kraken2 database -OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs +k_qa: 21 #optimal k-mer size for quality assessment +gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx +pantools_grouping: 3 #grouping relaxation setting for pantools +odb: "brassicales_odb10" #ODB database for busco +kraken2_nt: "/path/to/kraken2/nt" #kraken2 database +OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs # A section to define groups of accessions for plotting together in report. This # list is order sensitive and will only be used for QC. @@ -98,7 +98,7 @@ set: - 45_meh_0 # General settings; only change if you know what you are doing -jvm: "-Xmx100g -Xms100g" #java virtual machine settings -tmpdir: "/dev/shm/tmp_arabidopsis" #temporary directory for fast IO -nucmer_maxgap: 1000 #maximum gap size for nucmer -nucmer_minmatch: 1000 #minimum match size for nucmer +jvm: "-Xmx100g -Xms100g" #java virtual machine settings +tmpdir: "/dev/shm/tmp_arabidopsis" #temporary directory for fast IO +nucmer_maxgap: 1000 #maximum gap size for nucmer +nucmer_minmatch: 1000 #minimum match size for nucmer diff --git a/config/examples/grape.yaml b/config/examples/grape.yaml index ca8552e..8a35b86 100644 --- a/config/examples/grape.yaml +++ b/config/examples/grape.yaml @@ -10,7 +10,7 @@ samples: "config/samples.tsv" # Assembly settings -assembler: "hifiasm" #supported assemblers: hifiasm, verkko +assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye # All filtering and scaffolding parameters min_contig_len: 10000 @@ -65,12 +65,12 @@ helixer_model: "land_plant" #choose between land_plant, vertebrate, invertebrate helixer_max_gene_length: 64152 # Quality assessment settings -k_qa: 21 #optimal k-mer size for quality assessment -gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx -pantools_grouping: 3 #grouping relaxation setting for pantools -odb: "eudicots_odb10" #ODB database for busco -kraken2_nt: "/path/to/kraken2/nt" #kraken2 database -OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs +k_qa: 21 #optimal k-mer size for quality assessment +gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx +pantools_grouping: 3 #grouping relaxation setting for pantools +odb: "eudicots_odb10" #ODB database for busco +kraken2_nt: "/path/to/kraken2/nt" #kraken2 database +OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs # A section to define groups of accessions for plotting together in report. This # list is order sensitive and will only be used for QC. @@ -84,7 +84,7 @@ set: - Wolley # General settings; only change if you know what you are doing -jvm: "-Xmx100g -Xms100g" #java virtual machine settings -tmpdir: "/dev/shm/tmp_grape" #temporary directory for fast IO -nucmer_maxgap: 1000 #maximum gap size for nucmer -nucmer_minmatch: 1000 #minimum match size for nucmer +jvm: "-Xmx100g -Xms100g" #java virtual machine settings +tmpdir: "/dev/shm/tmp_grape" #temporary directory for fast IO +nucmer_maxgap: 1000 #maximum gap size for nucmer +nucmer_minmatch: 1000 #minimum match size for nucmer diff --git a/config/examples/human.yaml b/config/examples/human.yaml index d030c36..f5bfdad 100644 --- a/config/examples/human.yaml +++ b/config/examples/human.yaml @@ -10,7 +10,7 @@ samples: "config/samples.tsv" # Assembly settings -assembler: "hifiasm" #supported assemblers: hifiasm, verkko +assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye # All filtering and scaffolding parameters min_contig_len: 10000 @@ -71,12 +71,12 @@ helixer_model: "vertebrate" #choose between land_plant, vertebrate, invertebrate helixer_max_gene_length: 64152 # Quality assessment settings -k_qa: 21 #optimal k-mer size for quality assessment -gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx -pantools_grouping: 3 #grouping relaxation setting for pantools -odb: "primates_odb10" #ODB database for busco -kraken2_nt: "/path/to/kraken2/nt" #kraken2 database -OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs +k_qa: 21 #optimal k-mer size for quality assessment +gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx +pantools_grouping: 3 #grouping relaxation setting for pantools +odb: "primates_odb10" #ODB database for busco +kraken2_nt: "/path/to/kraken2/nt" #kraken2 database +OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs # A section to define groups of accessions for plotting together in report. This # list is order sensitive and will only be used for QC. @@ -87,7 +87,7 @@ set: - HG004 # General settings; only change if you know what you are doing -jvm: "-Xmx100g -Xms100g" #java virtual machine settings -tmpdir: "/dev/shm/tmp_human" #temporary directory for fast IO -nucmer_maxgap: 1000 #maximum gap size for nucmer -nucmer_minmatch: 1000 #minimum match size for nucmer +jvm: "-Xmx100g -Xms100g" #java virtual machine settings +tmpdir: "/dev/shm/tmp_human" #temporary directory for fast IO +nucmer_maxgap: 1000 #maximum gap size for nucmer +nucmer_minmatch: 1000 #minimum match size for nucmer diff --git a/workflow/envs/flye.yaml b/workflow/envs/flye.yaml new file mode 100644 index 0000000..315916f --- /dev/null +++ b/workflow/envs/flye.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - flye=2.9.5 diff --git a/workflow/rules/1.assembly/01.assembly.smk b/workflow/rules/1.assembly/01.assembly.smk index 9f7be80..9566713 100644 --- a/workflow/rules/1.assembly/01.assembly.smk +++ b/workflow/rules/1.assembly/01.assembly.smk @@ -250,3 +250,124 @@ rule verkko_rename_fasta: "results/benchmarks/1.assembly/verkko_{ext}_rename_fasta/{asmname}.txt" shell: "cp {input} {output} &> {log}" + +rule flye: + input: + hifi = get_hifi, + reference_genome = config["reference_genomes"][get_reference_id(wildcards.asmname)]["genome"], + output: + "results/{asmname}/1.assembly/01.flye_hifi_only/assembly.fasta", + log: + "results/logs/1.assembly/flye_hifi_only/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/flye_hifi_only/{asmname}.txt" + threads: + min(max(workflow.cores - 1, 1), 50) + conda: + "../../envs/flye.yaml" + shell: + """ + ( + GENOMESIZE=$(grep -vE '^>' {input.reference_genome} | tr -d '\n' | wc -c) + flye --genome-size $GENOMESIZE --threads {threads} --out-dir $(dirname {output}) --pacbio-hifi {input.hifi} + ) &> {log} + """ + +rule flye_with_ont: + input: + hifi = get_hifi, + ont = get_ont, + reference_genome = config["reference_genomes"][get_reference_id(wildcards.asmname)]["genome"], + output: + "results/{asmname}/1.assembly/01.flye_hifi_and_ont/assembly.fasta", + log: + "results/logs/1.assembly/flye_hifi_and_ont/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/flye_hifi_and_ont/{asmname}.txt" + threads: + min(max(workflow.cores - 1, 1), 50) + conda: + "../../envs/flye.yaml" + shell: + """ + ( + GENOMESIZE=$(grep -vE '^>' {input.reference_genome} | tr -d '\n' | wc -c) + flye --genome-size $GENOMESIZE --threads {threads} --out-dir $(dirname {output}) --pacbio-hifi {input.hifi} --nano-raw {input.ont} + ) &> {log} + """ + +rule flye_assembly_minimap2: + input: + hifi=get_hifi, + assembly="results/{asmname}/1.assembly/01.flye_{ext}/assembly.fasta", + output: + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam", + log: + "results/logs/1.assembly/flye_{ext}_assembly_minimap2/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/flye_{ext}_assembly_minimap2/{asmname}.txt" + threads: + min(max(workflow.cores - 1,1),50) + conda: + "../../envs/minimap2.yaml" + shell: + "(minimap2 -ax map-hifi -t {threads} {input.assembly} {input.hifi} | samtools sort -@ $(({threads}-1)) > {output}) 2> {log}" + +rule flye_assembly_minimap2_index: + input: + bam="results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam", + output: + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam.bai", + log: + "results/logs/1.assembly/flye_{ext}_assembly_minimap2_index/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/flye_{ext}_assembly_minimap2_index/{asmname}.txt" + threads: + min(max(workflow.cores - 1,1),50) + conda: + "../../envs/samtools.yaml" + shell: + "samtools index -@ $(({threads}-1)) {input} &> {log}" + +rule flye_hapdup: + input: + bam="results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam", + bai="results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam.bai", + assembly="results/{asmname}/1.assembly/01.flye_{ext}/assembly.fasta", + output: + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_dual_1.fasta", + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_dual_2.fasta", + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/phased_blocks_hp1.bed", + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/phased_blocks_hp2.bed", + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_phased_1.fasta", + "results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_phased_2.fasta", + log: + "results/logs/1.assembly/flye_{ext}_hapdup/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/flye_{ext}_hapdup/{asmname}.txt" + threads: + min(max(workflow.cores - 1,1),50) + container: + "docker://mkolmogo/hapdup:0.12" + shell: + "hapdup --threads {threads} --bam {input.bam} --rtype hifi --assembly {input.assembly} --out-dir $(dirname {output[0]}) &> {log}" + +def get_flye_output(wildcards): + if get_haplotypes(wildcards) == 1: + return "results/{asmname}/1.assembly/01.flye_{ext}/assembly.fasta" + else: + haplotype = int(wildcards.asmname[-1]) + asmname = get_clean_accession_id(wildcards.asmname) + return f"results/{asmname}/1.assembly/01.flye_{{ext}}/assembly_hapdup/hapdup_dual_{haplotype}.fasta" #using dual output here instead of phased, since we would like to keep contiguity (so phase switching may occur) + +rule flye_rename_fasta: + input: + get_flye_output, + output: + "results/{asmname}/1.assembly/01.flye_{ext}/{asmname}.fa", + log: + "results/logs/1.assembly/flye_{ext}_rename_fasta/{asmname}.log" + benchmark: + "results/benchmarks/1.assembly/flye_{ext}_rename_fasta/{asmname}.txt" + shell: + "cp {input} {output} &> {log}" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 2d24584..c27a417 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -16,8 +16,8 @@ properties: assembler: type: string - description: assembler to use; supported assemblers are hifiasm, verkko - pattern: "^(hifiasm|verkko)$" + description: assembler to use; supported assemblers are hifiasm, verkko, flye + pattern: "^(hifiasm|verkko|flye)$" default: "hifiasm" scaffolder: