diff --git a/config/config.yaml b/config/config.yaml
index 94e098e..a44368f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -4,15 +4,6 @@
 # a reasonable value might be 4, or 8.
 cpu_cores: 4
 
-# Used for scatter/gather processing, corresponds with the number of families within the taxon defined in
-# fasta_filter that have records for the specified marker. In practice, this means that the input BCDM TSV
-# file has to have exactly this many distinct (not empty) values for the `family` column where the column
-# named under fasta_filter.filter_level has has value fasta_filter.filter_name (e.g. the `order` must be
-# `Odonata`. TODO: make this so that it is done automatically from the data. At present, this needs to be
-# calculated by the user from the input file, e.g. by first making the sqlite database, or by grepping the
-# TSV file somehow.
-nfamilies: 63
-
 # Number of outgroups to include in each family-level analysis. Minimum is 2.
 outgroups: 3
 
@@ -46,8 +37,13 @@ datatype: NT
 # Choose which records to use from the database for the pipeline. filter_name only takes one name, so does filter level.
 # filter levels: class, order, family, genus, all (no filter)
 fasta_filter:
+  # filter_level must be family or a higher rank
   filter_level: kingdom
   filter_name: Animalia
+  # Maximum number of sequences per FASTA file. If a family exceeds this limit, its FASTA files are
+  # generated at a lower rank instead, i.e. per genus or, if a genus still has too many sequences,
+  # per species.
+  max_fasta_seq: 200
 
 name: phylogeny
 
@@ -61,6 +57,6 @@ file_names:
   bold_tsv: resources/BOLD_Public.21-Jun-2024.curated.NL.tsv
   open_tre: resources/opentree/opentree14.9_tree/labelled_supertree/labelled_supertree.tre
   hmm: resources/hmm/COI-5P.hmm
-  fasta_dir: results/fasta/family
+  fasta_dir: results/fasta/
   blast_dir: results/blast
diff --git a/results/fasta/family/README.md b/results/fasta/family/README.md
deleted file mode 100644
index 00f60aa..0000000
--- a/results/fasta/family/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This is a placeholder for a folder structure where FASTA files, alignments, and trees are written as intermediate results.
diff --git a/workflow/Snakefile b/workflow/Snakefile
index da19c01..8955797 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,3 +1,5 @@
+import os
+from pathlib import Path
 configfile: "config/config.yaml"
 
 # TODO refactor the snakefile so that all dependencies are managed by a containerized mamba.
@@ -5,9 +7,31 @@ configfile: "config/config.yaml"
 # container: "docker://condaforge/mambaforge:23.1.0-1"
 
+wildcard_constraints:
+    taxon=r"\w+"
+
+def get_all_taxon(wildcards):
+    checkpoint_output = checkpoints.family_fasta.get().output[0]
+    taxon_dir = os.path.join(Path(checkpoint_output).parent, "taxon")
+    taxon = [
+        t for t in os.listdir(taxon_dir)
+        if os.path.isdir(os.path.join(taxon_dir, t))
+        and os.stat(os.path.join(taxon_dir, t, "unaligned.fa")).st_size != 0  # ignore empty files
+        and "{" not in t  # ignore one dir named "Erebidae-Plecoptera{Genus_moth}"; it breaks the wildcard
+    ]
+    return taxon
+
+def get_prep_raxml_backbone_input(wildcards):
+    return expand(
+        "results/fasta/taxon/{taxon}/exemplars.fa",
+        taxon=get_all_taxon(wildcards)
+    )
+
 rule all:
     input:
-        "results/grafted.tre"
+        "results/fasta/raxml-ready.fa.raxml.bestTree.rooted"
+
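The splitting behaviour that `max_fasta_seq` introduces (implemented by `family_fasta.py` further down) boils down to writing each taxon at the first rank, walking family -> genus -> species, whose BIN count fits within the limit. A minimal sketch of the decision rule, with illustrative names and counts, not pipeline code:

    def split_rank(bin_counts, limit=200):
        """bin_counts maps a rank to the number of BINs a taxon has at that
        rank, e.g. {"family": 431, "genus": 57}. Returns the first rank,
        walking family -> genus -> species, that fits under the limit."""
        for rank in ("family", "genus", "species"):
            if rank in bin_counts and bin_counts[rank] <= limit:
                return rank
        return None  # family_fasta.py raises instead of splitting below species

    assert split_rank({"family": 431, "genus": 57}) == "genus"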
 # Creates and populates a SQLite database with filtered sequence records.
 # Uses BOLD dump TSV as defined in config file
@@ -82,25 +106,21 @@ rule megatree_loader:
         rm {params.tempsql} {params.tempdb}
         """
 
-# TODO either this is updated dynamically or scattergather is abandoned for dynamic()
-scattergather:
-    split = config["nfamilies"]
-
-# Exports unaligned BIN sequences for each family within the total set. This task is parallelized as many times as
-# specified by config["nfamilies"]. During this task, the longest raw sequence within each BIN is selected.
-rule family_fasta:
+# Exports unaligned BIN sequences for each taxon within the total set. The checkpoint parallelizes over as many
+# taxa as the input data yields. During this task, the longest raw sequence within each BIN is selected.
+checkpoint family_fasta:
     input: rules.map_opentol.output
     output:
-        fastas=scatter.split("results/fasta/family/{scatteritem}/unaligned.fa"),
-        status=f'{config["file_names"]["fasta_dir"]}/family_fasta.ok'
+        fasta_tsv="results/fasta/taxon_fasta.tsv",
     params:
         log_level=config['log_level'],
         fasta_dir=config["file_names"]["fasta_dir"],
         filter_level=config["fasta_filter"]["filter_level"],
         filter_name=config["fasta_filter"]["filter_name"],
+        max_seq_fasta=config["fasta_filter"]["max_fasta_seq"],
         marker=config["marker"],
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
-        chunks=config["nfamilies"],
     conda: "envs/family_fasta.yml"
     log: "logs/family_fasta/family_fasta.log"
     benchmark:
@@ -112,10 +132,9 @@ rule family_fasta:
            -f {params.fasta_dir} \
            -l {params.filter_level} \
            -n {params.filter_name} \
-           -c {params.chunks} \
+           -L {params.max_seq_fasta} \
            -m {params.marker} \
            -v {params.log_level} 2> {log}
-        touch {output.status}
         """
 
 # Creates a local BLAST database from the exported sequences that have OTT IDs. Later on, this database will be used
@@ -123,10 +142,9 @@ rule family_fasta:
 # hopefully mean that even monotypic families will have a constraint tree with >= 3 tips so all chunks can be treated
 # identically.
 rule makeblastdb:
-    input: rules.family_fasta.output.status
+    input: rules.family_fasta.output.fasta_tsv
     output: f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.nsq'
     params:
-        chunks=config["nfamilies"],
         fasta_dir=config["file_names"]["fasta_dir"],
         concatenated=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.fa',
         blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
@@ -137,7 +155,7 @@ rule makeblastdb:
     shell:
         """
         sh workflow/scripts/makeblastdb.sh \
-            {params.blastdb} {params.fasta_dir} {params.chunks} {params.concatenated} 2> {log}
+            {params.blastdb} {params.fasta_dir} {params.concatenated} 2> {log}
         """
 
@@ -146,16 +164,16 @@ rule makeblastdb:
 # hits across all queries.
 rule get_outgroups:
     input:
-        unaligned = "results/fasta/family/{scatteritem}/unaligned.fa",
+        unaligned = "results/fasta/taxon/{taxon}/unaligned.fa",
         makeblastdb = rules.makeblastdb.output
-    output: "results/fasta/family/{scatteritem}/outgroups.fa",
+    output: "results/fasta/taxon/{taxon}/outgroups.fa",
     params:
         outgroups = config["outgroups"],
         blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
     conda: "envs/blast.yml"
-    log: "logs/get_outgroups/get_outgroups-{scatteritem}.log"
+    log: "logs/get_outgroups/get_outgroups-{taxon}.log"
     benchmark:
-        "benchmarks/get_outgroups/get_outgroups-{scatteritem}.benchmark.txt"
+        "benchmarks/get_outgroups/get_outgroups-{taxon}.benchmark.txt"
     shell:
        """
        sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log}
       """
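get_outgroups.sh itself is unchanged by this diff and not shown here. Per the rule comment, it BLASTs the ingroup sequences against the local database and keeps the most frequent best hits. A rough sketch of that selection, assuming tabular (-outfmt 6) BLAST output; the function name and file handling are illustrative, not the script's actual code:

    from collections import Counter

    def top_outgroups(blast_tsv, n):
        """Pick the n subjects that are the best hit for the most queries."""
        best = {}  # query id -> (bitscore, subject id)
        with open(blast_tsv) as fh:
            for line in fh:
                fields = line.rstrip("\n").split("\t")
                query, subject, bitscore = fields[0], fields[1], float(fields[11])
                if query not in best or bitscore > best[query][0]:
                    best[query] = (bitscore, subject)
        tally = Counter(subject for _, subject in best.values())
        return [subject for subject, _ in tally.most_common(n)]

Counting only each query's single best hit keeps one dominant subject from crowding out the other candidates.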
@@ -166,16 +184,16 @@ rule get_outgroups:
 # service endpoint from OpenToL. The new implementation appears to achieve better coverage.
 rule family_constraint:
     input:
-        unaligned = "results/fasta/family/{scatteritem}/unaligned.fa",
-        outgroups = "results/fasta/family/{scatteritem}/outgroups.fa"
-    output: "results/fasta/family/{scatteritem}/constraint.tre"
+        unaligned = "results/fasta/taxon/{taxon}/unaligned.fa",
+        outgroups = "results/fasta/taxon/{taxon}/outgroups.fa"
+    output: "results/fasta/taxon/{taxon}/constraint.tre"
     params:
         log_level=config['log_level'],
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"])
     conda: "envs/family_constraint.yml"
-    log: "logs/family_constraint/family_constraint-{scatteritem}.log"
+    log: "logs/family_constraint/family_constraint-{taxon}.log"
     benchmark:
-        "benchmarks/family_constraint/family_constraint-{scatteritem}.benchmark.txt"
+        "benchmarks/family_constraint/family_constraint-{taxon}.benchmark.txt"
     shell:
         """
         python workflow/scripts/family_constraint.py \
@@ -190,17 +208,17 @@
 # is not needed at all because so far 0 revcom sequences were observed in BOLD.
 rule msa_hmm:
     input:
-        ingroup="results/fasta/family/{scatteritem}/unaligned.fa",
-        outgroup="results/fasta/family/{scatteritem}/outgroups.fa"
-    output: "results/fasta/family/{scatteritem}/aligned.fa"
+        ingroup="results/fasta/taxon/{taxon}/unaligned.fa",
+        outgroup="results/fasta/taxon/{taxon}/outgroups.fa"
+    output: "results/fasta/taxon/{taxon}/aligned.fa"
     params:
         log_level=config['log_level'],
         hmm_file=config['file_names']['hmm'],
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"])
     conda: "envs/msa_hmm.yml"
-    log: "logs/msa_hmm/msa_hmm-{scatteritem}.log"
+    log: "logs/msa_hmm/msa_hmm-{taxon}.log"
     benchmark:
-        "benchmarks/msa_hmm/msa_hmm-{scatteritem}.benchmark.txt"
+        "benchmarks/msa_hmm/msa_hmm-{taxon}.benchmark.txt"
     shell:
         """
         python workflow/scripts/msa_hmm.py \
@@ -221,17 +239,17 @@
 # criterion should be that the candidate outgroups are in the OpenToL.
 rule prep_raxml:
     input:
-        alignment="results/fasta/family/{scatteritem}/aligned.fa",
-        tree="results/fasta/family/{scatteritem}/constraint.tre"
+        alignment="results/fasta/taxon/{taxon}/aligned.fa",
+        tree="results/fasta/taxon/{taxon}/constraint.tre"
     output:
-        tree="results/fasta/family/{scatteritem}/remapped.tre"
+        tree="results/fasta/taxon/{taxon}/remapped.tre"
     params:
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
         log_level=config['log_level']
     conda: "envs/prep_raxml.yml"
-    log: "logs/prep_raxml/prep_raxml-{scatteritem}.log"
+    log: "logs/prep_raxml/prep_raxml-{taxon}.log"
     benchmark:
-        "benchmarks/prep_raxml/prep_raxml-{scatteritem}.benchmark.txt"
+        "benchmarks/prep_raxml/prep_raxml-{taxon}.benchmark.txt"
     shell:
         """
         python workflow/scripts/prep_raxml.py \
@@ -251,16 +269,16 @@
 # by trapping raxml-ng errors and then re-running without the constraint.
 rule run_raxml:
     input:
-        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
-        tree = "results/fasta/family/{scatteritem}/remapped.tre"
+        alignment = "results/fasta/taxon/{taxon}/aligned.fa",
+        tree = "results/fasta/taxon/{taxon}/remapped.tre"
     output:
-        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree"
+        tree = "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree"
     params:
         model = config['model'],
         num_outgroups = config['outgroups']
-    log: "logs/run_raxml/run_raxml-{scatteritem}.log"
+    log: "logs/run_raxml/run_raxml-{taxon}.log"
     benchmark:
-        "benchmarks/run_raxml/run_raxml-{scatteritem}.benchmark.txt"
+        "benchmarks/run_raxml/run_raxml-{taxon}.benchmark.txt"
     conda: "envs/raxml.yml"
     shell:
         """
@@ -282,7 +300,7 @@
                 --tree-constraint {input.tree} \
                 --workers 1 \
                 --blopt nr_safe \
-                --search >> {log} 2>>&1
+                --search >> {log} 2>&1
         elif [ "$TAXON_COUNT" -ge 5 ]; then
             # If no constraint tree, run raxml-ng without it
             echo "Running RAxML-NG without constraint tree" > {log}
@@ -292,7 +310,7 @@
                 --model {params.model} \
                 --workers 1 \
                 --blopt nr_safe \
-                --search >> {log} 2>>&1
+                --search >> {log} 2>&1
         else
             # Skip <5 taxa MSAs and copy input tree to output tree
             echo "Skipping <5 taxa MSA" > {log} 2>&1
@@ -305,17 +323,17 @@
 # clade members and then rooting on that bipartition branch. Subsequently, the outgroup taxa are removed.
 rule reroot_raxml_output:
     input:
-        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree",
-        constraint = "results/fasta/family/{scatteritem}/remapped.tre",
-        alignment = "results/fasta/family/{scatteritem}/aligned.fa"
+        tree = "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree",
+        constraint = "results/fasta/taxon/{taxon}/remapped.tre",
+        alignment = "results/fasta/taxon/{taxon}/aligned.fa"
     output:
-        outtree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
+        outtree = "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree.rooted"
     params:
         log_level = config['log_level'],
         num_outgroups = config['outgroups']
-    log: "logs/reroot_raxml_output/reroot_raxml_output-{scatteritem}.log"
+    log: "logs/reroot_raxml_output/reroot_raxml_output-{taxon}.log"
     benchmark:
-        "benchmarks/reroot_raxml_output/reroot_raxml_output-{scatteritem}.benchmark.txt"
+        "benchmarks/reroot_raxml_output/reroot_raxml_output-{taxon}.benchmark.txt"
     conda: "envs/reroot_backbone.yml"
     shell:
         """
@@ -340,15 +358,15 @@
 # to those optimized freely on a total tree.
 rule choose_exemplars:
     input:
-        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
-        tree= "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
-    output: "results/fasta/family/{scatteritem}/exemplars.fa"
+        alignment = "results/fasta/taxon/{taxon}/aligned.fa",
+        tree= "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree.rooted"
+    output: "results/fasta/taxon/{taxon}/exemplars.fa"
     params:
         log_level = config['log_level'],
         strategy = 'median'
-    log: "logs/choose_exemplars/choose_exemplars-{scatteritem}.log"
+    log: "logs/choose_exemplars/choose_exemplars-{taxon}.log"
     benchmark:
-        "benchmarks/choose_exemplars/choose_exemplars-{scatteritem}.benchmark.txt"
+        "benchmarks/choose_exemplars/choose_exemplars-{taxon}.benchmark.txt"
     conda: "envs/choose_exemplars.yaml"
     shell:
         """
@@ -371,7 +389,7 @@
 # data on the command line, awk turns it into FASTA.
 rule prep_raxml_backbone:
     input:
-        fastas=gather.split("results/fasta/family/{scatteritem}/exemplars.fa"),
+        fastas=get_prep_raxml_backbone_input,
         opentol='results/databases/megatree_loader.ok'
     output:
         fasta="results/fasta/raxml-ready.fa",
@@ -470,7 +488,6 @@ rule graft_clades:
         tree = "results/grafted.tre"
     params:
         log_level = config['log_level'],
-        nfamilies = config["nfamilies"]
     log: "logs/graft_clades/graft_clades.log"
     benchmark:
         "benchmarks/graft_clades.benchmark.txt"
@@ -482,6 +499,5 @@
             -f {input.clades} \
            -e {input.extinct} \
            -o {output.tree} \
-            -n {params.nfamilies} \
            -v {params.log_level} 2> {log}
        """
diff --git a/workflow/scripts/create_database.py b/workflow/scripts/create_database.py
index 3ed7e12..cc00d56 100644
--- a/workflow/scripts/create_database.py
+++ b/workflow/scripts/create_database.py
@@ -1,3 +1,4 @@
+import os.path
 import sqlite3
 import util
 import argparse
@@ -126,7 +127,7 @@
     class TEXT NOT NULL,
     family TEXT NOT NULL,
     subfamily TEXT NOT NULL,
     genus TEXT NOT NULL,
-    species TEXT NOT NULL,
+    species TEXT NOT NULL,
     bin_uri TEXT NOT NULL,
     opentol_id INTEGER,
     UNIQUE(kingdom, phylum, class, "order", family, subfamily, genus, species, bin_uri));
@@ -200,10 +201,17 @@ def update_fk():
     # Instantiate logger
     logger = util.get_formatted_logger('create_database', args.verbosity)
 
+    # Start from a clean slate: the database is rebuilt from scratch on every run
+    if os.path.isfile(args.outdb):
+        os.unlink(args.outdb)
+
     # Connect to the database
     logger.info('Going to connect to database')
     connection = sqlite3.connect(args.outdb)
     database_cursor = connection.cursor()
+    database_cursor.execute('PRAGMA journal_mode=OFF')
+    database_cursor.execute('PRAGMA synchronous=OFF')
+    database_cursor.execute('PRAGMA cache_size=100000')
+    database_cursor.execute('PRAGMA temp_store=MEMORY')
 
     # Create database tables
     create_taxon_table('taxon')
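The PRAGMA block above, repeated in map_opentol.py at the end of this patch, trades crash safety for bulk-load speed: no rollback journal, no fsync per write, a larger page cache, and in-memory temporaries. A self-contained sketch of the same connection setup (the function name is illustrative); corruption after a crash is acceptable here because these databases are regenerated from the input TSV:

    import sqlite3

    def bulk_load_connection(path):
        """Open a SQLite connection tuned for one-shot bulk loading."""
        conn = sqlite3.connect(path)
        cur = conn.cursor()
        cur.execute('PRAGMA journal_mode=OFF')   # no rollback journal
        cur.execute('PRAGMA synchronous=OFF')    # no fsync after each write
        cur.execute('PRAGMA cache_size=100000')  # larger page cache
        cur.execute('PRAGMA temp_store=MEMORY')  # temp tables and indices in RAM
        return conn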
diff --git a/workflow/scripts/family_fasta.py b/workflow/scripts/family_fasta.py
index c567e57..7ec41d5 100644
--- a/workflow/scripts/family_fasta.py
+++ b/workflow/scripts/family_fasta.py
@@ -4,7 +4,9 @@
 import pandas as pd
 import argparse
 import util
+from pathlib import Path
 
+levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'subfamily', 'genus', 'all']
 
 """
 This script, `family_fasta.py`, is responsible for generating FASTA files for each family of a specified higher taxon
@@ -35,14 +37,14 @@ def get_family_bins(q, conn):
     """
 
     # Check if filter_level in config.yaml is usable
-    if q['level'].lower() in ['kingdom', 'phylum', 'class', 'order', 'family', 'subfamily', 'genus', 'all']:
+    if q['level'].lower() in levels:
 
         # Select all distinct family names that match config.yaml filters
         level = q['level']
         name = q['name']
         marker_code = q['marker_code']
         sql = f'''
-            SELECT DISTINCT family, bin_uri
+            SELECT DISTINCT family, genus, species, bin_uri
             FROM barcode
             WHERE marker_code = '{marker_code}'
            AND "{level}" = '{name}'
@@ -61,7 +63,7 @@
     return fam
 
 
-def write_bin(q, conn, fh):
+def write_bin(q, conn, outfile):
     """
     Writes the longest sequence for a BIN to file
     :param q: query object
@@ -78,7 +80,7 @@
         JOIN taxon t ON b.taxon_id = t.taxon_id
         WHERE
             t."{q["level"]}" = "{q["name"]}" AND
-            t.family = "{q["family"]}" AND
+            t."{q["rank"]}" = "{q["taxon"]}" AND
             t.bin_uri = "{q["bin_uri"]}" AND
             b.marker_code = "{q["marker_code"]}" AND
             t.species IS NOT NULL AND
@@ -90,13 +92,14 @@
     famseq = pd.read_sql_query(query, conn)
 
-    # Append to file handle fh
-    for _, row in famseq.iterrows():
-        defline = f'>{row["barcode_id"]}|ott{row["opentol_id"]}|{row["processid"]}|{row["bin_uri"]}|{row["species"]}\n'
-        fh.write(defline)
+    # Append to outfile: this function is called once per BIN for the same file,
+    # so mode "w" would keep only the last BIN
+    with open(outfile, "a") as fh:
+        for _, row in famseq.iterrows():
+            defline = f'>{row["barcode_id"]}|ott{row["opentol_id"]}|{row["processid"]}|{row["bin_uri"]}|{row["species"]}\n'
+            fh.write(defline)
 
-        # Strip non-ACGT characters (dashes, esp.) because hmmer chokes on them
-        seq = row['nuc'].replace('-', '') + '\n'
-        fh.write(seq)
+            # Strip non-ACGT characters (dashes, esp.) because hmmer chokes on them
+            seq = row['nuc'].replace('-', '') + '\n'
+            fh.write(seq)
 
 if __name__ == '__main__':
@@ -106,15 +109,28 @@
     parser.add_argument('-f', '--fasta_dir', required=True, help='Directory to write FASTA files to')
     parser.add_argument('-l', '--level', required=True, help='Taxonomic level to filter (e.g. order)')
     parser.add_argument('-n', '--name', required=True, help='Taxon name to filter (e.g. Primates)')
-    parser.add_argument('-c', '--chunks', required=True, help="Number of chunks (families) to write to file")
+    parser.add_argument('-L', '--limit', required=True, type=int,
+                        help='Maximum number of sequences per FASTA file; above it, output is split at a lower rank (e.g. 200)')
     parser.add_argument('-m', '--marker', required=True, help='Marker code, e.g. COI-5P')
     parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)')
     args = parser.parse_args()
     database_file = args.database
 
+    level = args.level.lower()
+    if level not in levels:
+        raise Exception(f"Filter level {level} from config file does not exist as a column in the database")
+
+    if level != 'all' and levels.index(level) > levels.index("family"):
+        raise Exception("Level filter value must be 'family' or a higher rank")
+
     # Configure logger
     logger = util.get_formatted_logger('family_fasta', args.verbosity)
 
+    try:
+        os.makedirs(args.fasta_dir, exist_ok=True)
+    except OSError as error:
+        logger.error(error)
+        exit(1)
+
     # Connect to the database (creates a new file if it doesn't exist)
     logger.info(f"Going to connect to database {args.database}")
     connection = sqlite3.connect(args.database)
@@ -128,29 +144,68 @@
     }
     df = get_family_bins(query, connection)
 
-    # Iterate over distinct families
-    index = 1
-    for family in df['family'].unique():
-        logger.info(f"Writing {family}")
+    def write_fasta(query, rank, taxon, bin_uris):
+        logger.info(f"Writing {taxon} ({rank})")
 
         # Make directory and open file handle
-        subdir = os.path.join(args.fasta_dir, f"{index}-of-{args.chunks}")
+        align_file = os.path.join(args.fasta_dir, "taxon", taxon, "unaligned.fa")
         try:
-            os.mkdir(subdir)
+            os.makedirs(Path(align_file).parent, exist_ok=True)
         except OSError as error:
-            logger.warning(error)
-
-        with open(os.path.join(subdir, 'unaligned.fa'), 'w') as handle:
-
-            # Iterate over bins in family
+            logger.error(error)
+            exit(1)
+
+        # Truncate any leftover file from a previous run; write_bin appends per BIN
+        open(align_file, "w").close()
+
+        # Iterate over bins in the taxon
+        for bin_uri in bin_uris:
+            logger.debug(f"Writing {bin_uri}")
+            query['bin_uri'] = bin_uri
+            query["rank"] = rank
+            query["taxon"] = taxon
+            write_bin(query, connection, align_file)
+
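+    # taxon_fasta.tsv records one row per taxon considered:
+    # name <TAB> rank <TAB> BIN count <TAB> split to a lower rank? <TAB> parent taxon (empty for families)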
+    # Iterate over distinct families
+    with open(os.path.join(args.fasta_dir, "taxon_fasta.tsv"), "w") as fw:
+        unique_families = df['family'].unique()
+        split_families = []
+        for family in unique_families:
             family_bin_uris = df[df['family'] == family]['bin_uri'].unique()
-            for bin_uri in family_bin_uris:
-                logger.debug(f"Writing {bin_uri}")
-                query['bin_uri'] = bin_uri
-                query['family'] = family
-                write_bin(query, connection, handle)
-
-        index += 1
+            if len(family_bin_uris) > args.limit:
+                split_families.append(family)
+                fw.write(f"{family}\tfamily\t{len(family_bin_uris)}\tTrue\t\n")
+                continue
+
+            write_fasta(query, "family", family, family_bin_uris)
+            fw.write(f"{family}\tfamily\t{len(family_bin_uris)}\tFalse\t\n")
+
+        # Families over the limit are retried per genus
+        split_genera = []
+        for family in split_families:
+            unique_genera = df[df['family'] == family]['genus'].unique()
+            for genus in unique_genera:
+                if not genus:
+                    continue
+                genus_bin_uris = df[(df['family'] == family) & (df['genus'] == genus)]['bin_uri'].unique()
+                if len(genus_bin_uris) > args.limit:
+                    split_genera.append(genus)
+                    fw.write(f"{genus}\tgenus\t{len(genus_bin_uris)}\tTrue\t{family}\n")
+                    continue
+
+                write_fasta(query, "genus", genus, genus_bin_uris)
+                fw.write(f"{genus}\tgenus\t{len(genus_bin_uris)}\tFalse\t{family}\n")
+
+        # Genera over the limit are retried per species; species is the last resort
+        for genus in split_genera:
+            unique_species = df[df['genus'] == genus]['species'].unique()
+            for species in unique_species:
+                if not species:
+                    continue
+                species_bin_uris = df[(df['genus'] == genus) & (df['species'] == species)]['bin_uri'].unique()
+                if len(species_bin_uris) > args.limit:
+                    raise NotImplementedError(
+                        f"Species {species} has more than {args.limit} BINs; splitting below species rank is not implemented")
+                write_fasta(query, "species", species, species_bin_uris)
+                fw.write(f"{species}\tspecies\t{len(species_bin_uris)}\tFalse\t{genus}\n")
 
     # Close the connection
     connection.close()
diff --git a/workflow/scripts/makeblastdb.sh b/workflow/scripts/makeblastdb.sh
index 3267d49..583e840 100644
--- a/workflow/scripts/makeblastdb.sh
+++ b/workflow/scripts/makeblastdb.sh
@@ -1,7 +1,6 @@
 DATABASE=$1
 FASTADIR=$2
-ITERATIONS=$3
-TMP=$4
+TMP=$3
 
 # This shell script, `makeblastdb.sh`, is responsible for creating a BLAST database from FASTA files generated by
 # `family_fasta.py`. The script performs the following steps:
@@ -20,10 +19,10 @@ TMP=$4
 
 # doing this sequentially to avoid race conditions in BLAST indexing
-for i in $(seq 1 $ITERATIONS); do
-
-  # using the output from family_fasta
-  INFILE=${FASTADIR}/${i}-of-${ITERATIONS}/unaligned.fa
+i=0
+for TAXON in $(ls -d $FASTADIR/taxon/*); do
+  # arithmetic increment; plain 'i+=1' would concatenate strings in bash
+  i=$((i+1))
+  INFILE=${TAXON}/unaligned.fa
 
   # only keep records with ott IDs, reformat the headers to retain the process ID, write to $TMP
   # start new file on first iteration, then append
diff --git a/workflow/scripts/map_opentol.py b/workflow/scripts/map_opentol.py
index 9847601..e3cdbad 100644
--- a/workflow/scripts/map_opentol.py
+++ b/workflow/scripts/map_opentol.py
@@ -133,6 +133,10 @@ def postprocess_db():
     logger.info(f'Going to connect to database {args.database}')
     conn = sqlite3.connect(args.database)
     cursor = conn.cursor()
+    cursor.execute('PRAGMA journal_mode=OFF')
+    cursor.execute('PRAGMA synchronous=OFF')
+    cursor.execute('PRAGMA cache_size=100000')
+    cursor.execute('PRAGMA temp_store=MEMORY')
 
     # Infer taxonomic context from marker name
     if args.marker == "COI-5P":