Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ All notable changes to MoGAAAP will be documented in this file.
### Added
- Add wrapper script to initialise, configure and run MoGAAAP (#80).
- Add option to use the pipeline for already existing assemblies (#83).
- Add flye as an alternative assembler (#93).

### Changed
- Update sample sheet to allow using the pipeline for already existing assemblies (#83).
Expand Down
2 changes: 1 addition & 1 deletion config/example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
samples: "config/example.tsv"

# Assembly settings
assembler: "hifiasm" #supported assemblers: hifiasm, verkko
assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye

# Scaffolding settings
scaffolder: "ntjoin" #supported scaffolders: ntjoin, ragtag
Expand Down
22 changes: 11 additions & 11 deletions config/examples/arabidopsis.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
samples: "config/samples.tsv"

# Assembly settings
assembler: "hifiasm" #supported assemblers: hifiasm, verkko
assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye

# All filtering and scaffolding parameters
min_contig_len: 10000
Expand Down Expand Up @@ -53,12 +53,12 @@ helixer_model: "land_plant" #choose between land_plant, vertebrate, invertebrate
helixer_max_gene_length: 64152

# Quality assessment settings
k_qa: 21 #optimal k-mer size for quality assessment
gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx
pantools_grouping: 3 #grouping relaxation setting for pantools
odb: "brassicales_odb10" #ODB database for busco
kraken2_nt: "/path/to/kraken2/nt" #kraken2 database
OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs
k_qa: 21 #optimal k-mer size for quality assessment
gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx
pantools_grouping: 3 #grouping relaxation setting for pantools
odb: "brassicales_odb10" #ODB database for busco
kraken2_nt: "/path/to/kraken2/nt" #kraken2 database
OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs

# A section to define groups of accessions for plotting together in report. This
# list is order sensitive and will only be used for QC.
Expand Down Expand Up @@ -98,7 +98,7 @@ set:
- 45_meh_0

# General settings; only change if you know what you are doing
jvm: "-Xmx100g -Xms100g" #java virtual machine settings
tmpdir: "/dev/shm/tmp_arabidopsis" #temporary directory for fast IO
nucmer_maxgap: 1000 #maximum gap size for nucmer
nucmer_minmatch: 1000 #minimum match size for nucmer
jvm: "-Xmx100g -Xms100g" #java virtual machine settings
tmpdir: "/dev/shm/tmp_arabidopsis" #temporary directory for fast IO
nucmer_maxgap: 1000 #maximum gap size for nucmer
nucmer_minmatch: 1000 #minimum match size for nucmer
22 changes: 11 additions & 11 deletions config/examples/grape.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
samples: "config/samples.tsv"

# Assembly settings
assembler: "hifiasm" #supported assemblers: hifiasm, verkko
assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye

# All filtering and scaffolding parameters
min_contig_len: 10000
Expand Down Expand Up @@ -65,12 +65,12 @@ helixer_model: "land_plant" #choose between land_plant, vertebrate, invertebrate
helixer_max_gene_length: 64152

# Quality assessment settings
k_qa: 21 #optimal k-mer size for quality assessment
gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx
pantools_grouping: 3 #grouping relaxation setting for pantools
odb: "eudicots_odb10" #ODB database for busco
kraken2_nt: "/path/to/kraken2/nt" #kraken2 database
OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs
k_qa: 21 #optimal k-mer size for quality assessment
gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx
pantools_grouping: 3 #grouping relaxation setting for pantools
odb: "eudicots_odb10" #ODB database for busco
kraken2_nt: "/path/to/kraken2/nt" #kraken2 database
OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs

# A section to define groups of accessions for plotting together in report. This
# list is order sensitive and will only be used for QC.
Expand All @@ -84,7 +84,7 @@ set:
- Wolley

# General settings; only change if you know what you are doing
jvm: "-Xmx100g -Xms100g" #java virtual machine settings
tmpdir: "/dev/shm/tmp_grape" #temporary directory for fast IO
nucmer_maxgap: 1000 #maximum gap size for nucmer
nucmer_minmatch: 1000 #minimum match size for nucmer
jvm: "-Xmx100g -Xms100g" #java virtual machine settings
tmpdir: "/dev/shm/tmp_grape" #temporary directory for fast IO
nucmer_maxgap: 1000 #maximum gap size for nucmer
nucmer_minmatch: 1000 #minimum match size for nucmer
22 changes: 11 additions & 11 deletions config/examples/human.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
samples: "config/samples.tsv"

# Assembly settings
assembler: "hifiasm" #supported assemblers: hifiasm, verkko
assembler: "hifiasm" #supported assemblers: hifiasm, verkko, flye

# All filtering and scaffolding parameters
min_contig_len: 10000
Expand Down Expand Up @@ -71,12 +71,12 @@ helixer_model: "vertebrate" #choose between land_plant, vertebrate, invertebrate
helixer_max_gene_length: 64152

# Quality assessment settings
k_qa: 21 #optimal k-mer size for quality assessment
gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx
pantools_grouping: 3 #grouping relaxation setting for pantools
odb: "primates_odb10" #ODB database for busco
kraken2_nt: "/path/to/kraken2/nt" #kraken2 database
OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs
k_qa: 21 #optimal k-mer size for quality assessment
gxdb: "/local/fcs/gxdb" #path to gxdb for fcs-gx
pantools_grouping: 3 #grouping relaxation setting for pantools
odb: "primates_odb10" #ODB database for busco
kraken2_nt: "/path/to/kraken2/nt" #kraken2 database
OMAdb: "/path/to/oma_databases/LUCA.h5" #OMA database for orthologs

# A section to define groups of accessions for plotting together in report. This
# list is order sensitive and will only be used for QC.
Expand All @@ -87,7 +87,7 @@ set:
- HG004

# General settings; only change if you know what you are doing
jvm: "-Xmx100g -Xms100g" #java virtual machine settings
tmpdir: "/dev/shm/tmp_human" #temporary directory for fast IO
nucmer_maxgap: 1000 #maximum gap size for nucmer
nucmer_minmatch: 1000 #minimum match size for nucmer
jvm: "-Xmx100g -Xms100g" #java virtual machine settings
tmpdir: "/dev/shm/tmp_human" #temporary directory for fast IO
nucmer_maxgap: 1000 #maximum gap size for nucmer
nucmer_minmatch: 1000 #minimum match size for nucmer
5 changes: 5 additions & 0 deletions workflow/envs/flye.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- flye=2.9.5
121 changes: 121 additions & 0 deletions workflow/rules/1.assembly/01.assembly.smk
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,124 @@ rule verkko_rename_fasta:
"results/benchmarks/1.assembly/verkko_{ext}_rename_fasta/{asmname}.txt"
shell:
"cp {input} {output} &> {log}"

rule flye:
input:
hifi = get_hifi,
reference_genome = config["reference_genomes"][get_reference_id(wildcards.asmname)]["genome"],
output:
"results/{asmname}/1.assembly/01.flye_hifi_only/assembly.fasta",
log:
"results/logs/1.assembly/flye_hifi_only/{asmname}.log"
benchmark:
"results/benchmarks/1.assembly/flye_hifi_only/{asmname}.txt"
threads:
min(max(workflow.cores - 1, 1), 50)
conda:
"../../envs/flye.yaml"
shell:
"""
(
GENOMESIZE=$(grep -vE '^>' {input.reference_genome} | tr -d '\n' | wc -c)
flye --genome-size $GENOMESIZE --threads {threads} --out-dir $(dirname {output}) --pacbio-hifi {input.hifi}
) &> {log}
"""

rule flye_with_ont:
input:
hifi = get_hifi,
ont = get_ont,
reference_genome = config["reference_genomes"][get_reference_id(wildcards.asmname)]["genome"],
output:
"results/{asmname}/1.assembly/01.flye_hifi_and_ont/assembly.fasta",
log:
"results/logs/1.assembly/flye_hifi_and_ont/{asmname}.log"
benchmark:
"results/benchmarks/1.assembly/flye_hifi_and_ont/{asmname}.txt"
threads:
min(max(workflow.cores - 1, 1), 50)
conda:
"../../envs/flye.yaml"
shell:
"""
(
GENOMESIZE=$(grep -vE '^>' {input.reference_genome} | tr -d '\n' | wc -c)
flye --genome-size $GENOMESIZE --threads {threads} --out-dir $(dirname {output}) --pacbio-hifi {input.hifi} --nano-raw {input.ont}
) &> {log}
"""

rule flye_assembly_minimap2:
input:
hifi=get_hifi,
assembly="results/{asmname}/1.assembly/01.flye_{ext}/assembly.fasta",
output:
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam",
log:
"results/logs/1.assembly/flye_{ext}_assembly_minimap2/{asmname}.log"
benchmark:
"results/benchmarks/1.assembly/flye_{ext}_assembly_minimap2/{asmname}.txt"
threads:
min(max(workflow.cores - 1,1),50)
conda:
"../../envs/minimap2.yaml"
shell:
"(minimap2 -ax map-hifi -t {threads} {input.assembly} {input.hifi} | samtools sort -@ $(({threads}-1)) > {output}) 2> {log}"

rule flye_assembly_minimap2_index:
input:
bam="results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam",
output:
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam.bai",
log:
"results/logs/1.assembly/flye_{ext}_assembly_minimap2_index/{asmname}.log"
benchmark:
"results/benchmarks/1.assembly/flye_{ext}_assembly_minimap2_index/{asmname}.txt"
threads:
min(max(workflow.cores - 1,1),50)
conda:
"../../envs/samtools.yaml"
shell:
"samtools index -@ $(({threads}-1)) {input} &> {log}"

rule flye_hapdup:
input:
bam="results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam",
bai="results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hifi_vs_assembly.bam.bai",
assembly="results/{asmname}/1.assembly/01.flye_{ext}/assembly.fasta",
output:
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_dual_1.fasta",
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_dual_2.fasta",
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/phased_blocks_hp1.bed",
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/phased_blocks_hp2.bed",
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_phased_1.fasta",
"results/{asmname}/1.assembly/01.flye_{ext}/assembly_hapdup/hapdup_phased_2.fasta",
log:
"results/logs/1.assembly/flye_{ext}_hapdup/{asmname}.log"
benchmark:
"results/benchmarks/1.assembly/flye_{ext}_hapdup/{asmname}.txt"
threads:
min(max(workflow.cores - 1,1),50)
container:
"docker://mkolmogo/hapdup:0.12"
shell:
"hapdup --threads {threads} --bam {input.bam} --rtype hifi --assembly {input.assembly} --out-dir $(dirname {output[0]}) &> {log}"

def get_flye_output(wildcards):
if get_haplotypes(wildcards) == 1:
return "results/{asmname}/1.assembly/01.flye_{ext}/assembly.fasta"
else:
haplotype = int(wildcards.asmname[-1])
asmname = get_clean_accession_id(wildcards.asmname)
return f"results/{asmname}/1.assembly/01.flye_{{ext}}/assembly_hapdup/hapdup_dual_{haplotype}.fasta" #using dual output here instead of phased, since we would like to keep contiguity (so phase switching may occur)

rule flye_rename_fasta:
input:
get_flye_output,
output:
"results/{asmname}/1.assembly/01.flye_{ext}/{asmname}.fa",
log:
"results/logs/1.assembly/flye_{ext}_rename_fasta/{asmname}.log"
benchmark:
"results/benchmarks/1.assembly/flye_{ext}_rename_fasta/{asmname}.txt"
shell:
"cp {input} {output} &> {log}"
4 changes: 2 additions & 2 deletions workflow/schemas/config.schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ properties:

assembler:
type: string
description: assembler to use; supported assemblers are hifiasm, verkko
pattern: "^(hifiasm|verkko)$"
description: assembler to use; supported assemblers are hifiasm, verkko, flye
pattern: "^(hifiasm|verkko|flye)$"
default: "hifiasm"

scaffolder:
Expand Down