diff --git a/config/config.yaml b/config/config.yaml
index 94e098e..a44368f 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -4,15 +4,6 @@
 # a reasonable value might be 4, or 8.
 cpu_cores: 4
 
-# Used for scatter/gather processing, corresponds with the number of families within the taxon defined in
-# fasta_filter that have records for the specified marker. In practice, this means that the input BCDM TSV
-# file has to have exactly this many distinct (not empty) values for the `family` column where the column
-# named under fasta_filter.filter_level has has value fasta_filter.filter_name (e.g. the `order` must be
-# `Odonata`. TODO: make this so that it is done automatically from the data. At present, this needs to be
-# calculated by the user from the input file, e.g. by first making the sqlite database, or by grepping the
-# TSV file somehow.
-nfamilies: 63
-
 # Number of outgroups to include in each family-level analysis. Minimum is 2.
 outgroups: 3
 
@@ -46,8 +37,13 @@ datatype: NT
 # Choose which records to use from the database for the pipeline. filter_name only takes one name, so does filter level.
 # filter levels: class, order, family, genus, all (no filter)
 fasta_filter:
+  # filter_level must be family or a higher rank
   filter_level: kingdom
   filter_name: Animalia
+  # Maximum number of sequences per FASTA file. If a family exceeds this limit, its FASTA files are
+  # generated at a lower rank instead, i.e. per genus or, if a genus still has too many sequences,
+  # per species.
+  max_fasta_seq: 200
 
 name: phylogeny
 
@@ -61,6 +57,6 @@ file_names:
   bold_tsv: resources/BOLD_Public.21-Jun-2024.curated.NL.tsv
   open_tre: resources/opentree/opentree14.9_tree/labelled_supertree/labelled_supertree.tre
   hmm: resources/hmm/COI-5P.hmm
-  fasta_dir: results/fasta/family
+  fasta_dir: results/fasta/
   blast_dir: results/blast
diff --git a/results/fasta/family/README.md b/results/fasta/family/README.md
deleted file mode 100644
index 00f60aa..0000000
--- a/results/fasta/family/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This is a placeholder for a folder structure where FASTA files, alignments, and trees are written as intermediate results.
diff --git a/workflow/Snakefile b/workflow/Snakefile
index da19c01..8955797 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -1,3 +1,5 @@
+import os
+from pathlib import Path
 configfile: "config/config.yaml"
 
 # TODO refactor the snakefile so that all dependencies are managed by a containerized mamba.
@@ -5,9 +7,31 @@ configfile: "config/config.yaml"
 # container: "docker://condaforge/mambaforge:23.1.0-1"
 
+wildcard_constraints:
+    taxon=r"\w+"
+
+def get_all_taxon(wildcards):
+    checkpoint_output = checkpoints.family_fasta.get().output[0]
+    taxon_dir = os.path.join(Path(checkpoint_output).parent, "taxon")
+    taxon = [
+        t for t in os.listdir(taxon_dir)
+        if os.path.isdir(os.path.join(taxon_dir, t))
+        and os.stat(os.path.join(taxon_dir, t, "unaligned.fa")).st_size != 0  # ignore empty files
+        and "{" not in t  # ignore one dir named "Erebidae-Plecoptera{Genus_moth}"; it breaks the wildcard
+    ]
+    return taxon
+
+def get_prep_raxml_backbone_input(wildcards):
+    return expand(
+        "results/fasta/taxon/{taxon}/exemplars.fa",
+        taxon=get_all_taxon(wildcards)
+    )
+
 rule all:
     input:
-        "results/grafted.tre"
+        "results/fasta/raxml-ready.fa.raxml.bestTree.rooted"
+
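The splitting behaviour that `max_fasta_seq` introduces (implemented by `family_fasta.py` further down) boils down to writing each taxon at the first rank, walking family -> genus -> species, whose BIN count fits within the limit. A minimal sketch of the decision rule, with illustrative names and counts, not pipeline code:

    def split_rank(bin_counts, limit=200):
        """bin_counts maps a rank to the number of BINs a taxon has at that
        rank, e.g. {"family": 431, "genus": 57}. Returns the first rank,
        walking family -> genus -> species, that fits under the limit."""
        for rank in ("family", "genus", "species"):
            if rank in bin_counts and bin_counts[rank] <= limit:
                return rank
        return None  # family_fasta.py raises instead of splitting below species

    assert split_rank({"family": 431, "genus": 57}) == "genus"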
 # Creates and populates a SQLite database with filtered sequence records.
 # Uses BOLD dump TSV as defined in config file
@@ -82,25 +106,21 @@ rule megatree_loader:
         rm {params.tempsql} {params.tempdb}
         """
 
-# TODO either this is updated dynamically or scattergather is abandoned for dynamic()
-scattergather:
-    split = config["nfamilies"]
-
-# Exports unaligned BIN sequences for each family within the total set. This task is parallelized as many times as
-# specified by config["nfamilies"]. During this task, the longest raw sequence within each BIN is selected.
-rule family_fasta:
+# Exports unaligned BIN sequences for each taxon within the total set. The checkpoint parallelizes over as many
+# taxa as the input data yields. During this task, the longest raw sequence within each BIN is selected.
+checkpoint family_fasta:
     input: rules.map_opentol.output
     output:
-        fastas=scatter.split("results/fasta/family/{scatteritem}/unaligned.fa"),
-        status=f'{config["file_names"]["fasta_dir"]}/family_fasta.ok'
+        fasta_tsv="results/fasta/taxon_fasta.tsv",
     params:
         log_level=config['log_level'],
         fasta_dir=config["file_names"]["fasta_dir"],
         filter_level=config["fasta_filter"]["filter_level"],
         filter_name=config["fasta_filter"]["filter_name"],
+        max_seq_fasta=config["fasta_filter"]["max_fasta_seq"],
         marker=config["marker"],
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
-        chunks=config["nfamilies"],
     conda: "envs/family_fasta.yml"
     log: "logs/family_fasta/family_fasta.log"
     benchmark:
@@ -112,10 +132,9 @@ rule family_fasta:
            -f {params.fasta_dir} \
            -l {params.filter_level} \
            -n {params.filter_name} \
-           -c {params.chunks} \
+           -L {params.max_seq_fasta} \
            -m {params.marker} \
            -v {params.log_level} 2> {log}
-        touch {output.status}
         """
 
 # Creates a local BLAST database from the exported sequences that have OTT IDs. Later on, this database will be used
@@ -123,10 +142,9 @@ rule family_fasta:
 # hopefully mean that even monotypic families will have a constraint tree with >= 3 tips so all chunks can be treated
 # identically.
 rule makeblastdb:
-    input: rules.family_fasta.output.status
+    input: rules.family_fasta.output.fasta_tsv
     output: f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.nsq'
     params:
-        chunks=config["nfamilies"],
         fasta_dir=config["file_names"]["fasta_dir"],
         concatenated=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}.fa',
         blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
@@ -137,7 +155,7 @@ rule makeblastdb:
     shell:
         """
         sh workflow/scripts/makeblastdb.sh \
-            {params.blastdb} {params.fasta_dir} {params.chunks} {params.concatenated} 2> {log}
+            {params.blastdb} {params.fasta_dir} {params.concatenated} 2> {log}
         """
 
@@ -146,16 +164,16 @@ rule makeblastdb:
 # hits across all queries.
 rule get_outgroups:
     input:
-        unaligned = "results/fasta/family/{scatteritem}/unaligned.fa",
+        unaligned = "results/fasta/taxon/{taxon}/unaligned.fa",
         makeblastdb = rules.makeblastdb.output
-    output: "results/fasta/family/{scatteritem}/outgroups.fa",
+    output: "results/fasta/taxon/{taxon}/outgroups.fa",
     params:
         outgroups = config["outgroups"],
         blastdb=f'{config["file_names"]["blast_dir"]}/{config["blastdb"]}'
     conda: "envs/blast.yml"
-    log: "logs/get_outgroups/get_outgroups-{scatteritem}.log"
+    log: "logs/get_outgroups/get_outgroups-{taxon}.log"
     benchmark:
-        "benchmarks/get_outgroups/get_outgroups-{scatteritem}.benchmark.txt"
+        "benchmarks/get_outgroups/get_outgroups-{taxon}.benchmark.txt"
     shell:
        """
        sh workflow/scripts/get_outgroups.sh {params.blastdb} {params.outgroups} {input.unaligned} {output} 2> {log}
       """
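get_outgroups.sh itself is unchanged by this diff and not shown here. Per the rule comment, it BLASTs the ingroup sequences against the local database and keeps the most frequent best hits. A rough sketch of that selection, assuming tabular (-outfmt 6) BLAST output; the function name and file handling are illustrative, not the script's actual code:

    from collections import Counter

    def top_outgroups(blast_tsv, n):
        """Pick the n subjects that are the best hit for the most queries."""
        best = {}  # query id -> (bitscore, subject id)
        with open(blast_tsv) as fh:
            for line in fh:
                fields = line.rstrip("\n").split("\t")
                query, subject, bitscore = fields[0], fields[1], float(fields[11])
                if query not in best or bitscore > best[query][0]:
                    best[query] = (bitscore, subject)
        tally = Counter(subject for _, subject in best.values())
        return [subject for subject, _ in tally.most_common(n)]

Counting only each query's single best hit keeps one dominant subject from crowding out the other candidates.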
@@ -166,16 +184,16 @@ rule get_outgroups:
 # service endpoint from OpenToL. The new implementation appears to achieve better coverage.
 rule family_constraint:
     input:
-        unaligned = "results/fasta/family/{scatteritem}/unaligned.fa",
-        outgroups = "results/fasta/family/{scatteritem}/outgroups.fa"
-    output: "results/fasta/family/{scatteritem}/constraint.tre"
+        unaligned = "results/fasta/taxon/{taxon}/unaligned.fa",
+        outgroups = "results/fasta/taxon/{taxon}/outgroups.fa"
+    output: "results/fasta/taxon/{taxon}/constraint.tre"
     params:
         log_level=config['log_level'],
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"])
     conda: "envs/family_constraint.yml"
-    log: "logs/family_constraint/family_constraint-{scatteritem}.log"
+    log: "logs/family_constraint/family_constraint-{taxon}.log"
     benchmark:
-        "benchmarks/family_constraint/family_constraint-{scatteritem}.benchmark.txt"
+        "benchmarks/family_constraint/family_constraint-{taxon}.benchmark.txt"
     shell:
         """
         python workflow/scripts/family_constraint.py \
@@ -190,17 +208,17 @@
 # is not needed at all because so far 0 revcom sequences were observed in BOLD.
 rule msa_hmm:
     input:
-        ingroup="results/fasta/family/{scatteritem}/unaligned.fa",
-        outgroup="results/fasta/family/{scatteritem}/outgroups.fa"
-    output: "results/fasta/family/{scatteritem}/aligned.fa"
+        ingroup="results/fasta/taxon/{taxon}/unaligned.fa",
+        outgroup="results/fasta/taxon/{taxon}/outgroups.fa"
+    output: "results/fasta/taxon/{taxon}/aligned.fa"
     params:
         log_level=config['log_level'],
         hmm_file=config['file_names']['hmm'],
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"])
     conda: "envs/msa_hmm.yml"
-    log: "logs/msa_hmm/msa_hmm-{scatteritem}.log"
+    log: "logs/msa_hmm/msa_hmm-{taxon}.log"
     benchmark:
-        "benchmarks/msa_hmm/msa_hmm-{scatteritem}.benchmark.txt"
+        "benchmarks/msa_hmm/msa_hmm-{taxon}.benchmark.txt"
     shell:
         """
         python workflow/scripts/msa_hmm.py \
@@ -221,17 +239,17 @@
 # criterion should be that the candidate outgroups are in the OpenToL.
 rule prep_raxml:
     input:
-        alignment="results/fasta/family/{scatteritem}/aligned.fa",
-        tree="results/fasta/family/{scatteritem}/constraint.tre"
+        alignment="results/fasta/taxon/{taxon}/aligned.fa",
+        tree="results/fasta/taxon/{taxon}/constraint.tre"
     output:
-        tree="results/fasta/family/{scatteritem}/remapped.tre"
+        tree="results/fasta/taxon/{taxon}/remapped.tre"
     params:
         db="results/databases/BOLD_{}_barcodes.db".format(config["marker"]),
         log_level=config['log_level']
     conda: "envs/prep_raxml.yml"
-    log: "logs/prep_raxml/prep_raxml-{scatteritem}.log"
+    log: "logs/prep_raxml/prep_raxml-{taxon}.log"
     benchmark:
-        "benchmarks/prep_raxml/prep_raxml-{scatteritem}.benchmark.txt"
+        "benchmarks/prep_raxml/prep_raxml-{taxon}.benchmark.txt"
     shell:
         """
         python workflow/scripts/prep_raxml.py \
@@ -251,16 +269,16 @@
 # by trapping raxml-ng errors and then re-running without the constraint.
 rule run_raxml:
     input:
-        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
-        tree = "results/fasta/family/{scatteritem}/remapped.tre"
+        alignment = "results/fasta/taxon/{taxon}/aligned.fa",
+        tree = "results/fasta/taxon/{taxon}/remapped.tre"
     output:
-        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree"
+        tree = "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree"
     params:
         model = config['model'],
         num_outgroups = config['outgroups']
-    log: "logs/run_raxml/run_raxml-{scatteritem}.log"
+    log: "logs/run_raxml/run_raxml-{taxon}.log"
     benchmark:
-        "benchmarks/run_raxml/run_raxml-{scatteritem}.benchmark.txt"
+        "benchmarks/run_raxml/run_raxml-{taxon}.benchmark.txt"
     conda: "envs/raxml.yml"
     shell:
         """
@@ -282,7 +300,7 @@
                 --tree-constraint {input.tree} \
                 --workers 1 \
                 --blopt nr_safe \
-                --search >> {log} 2>>&1
+                --search >> {log} 2>&1
         elif [ "$TAXON_COUNT" -ge 5 ]; then
             # If no constraint tree, run raxml-ng without it
             echo "Running RAxML-NG without constraint tree" > {log}
@@ -292,7 +310,7 @@
                 --model {params.model} \
                 --workers 1 \
                 --blopt nr_safe \
-                --search >> {log} 2>>&1
+                --search >> {log} 2>&1
         else
             # Skip <5 taxa MSAs and copy input tree to output tree
             echo "Skipping <5 taxa MSA" > {log} 2>&1
@@ -305,17 +323,17 @@
 # clade members and then rooting on that bipartition branch. Subsequently, the outgroup taxa are removed.
 rule reroot_raxml_output:
     input:
-        tree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree",
-        constraint = "results/fasta/family/{scatteritem}/remapped.tre",
-        alignment = "results/fasta/family/{scatteritem}/aligned.fa"
+        tree = "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree",
+        constraint = "results/fasta/taxon/{taxon}/remapped.tre",
+        alignment = "results/fasta/taxon/{taxon}/aligned.fa"
     output:
-        outtree = "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
+        outtree = "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree.rooted"
     params:
         log_level = config['log_level'],
         num_outgroups = config['outgroups']
-    log: "logs/reroot_raxml_output/reroot_raxml_output-{scatteritem}.log"
+    log: "logs/reroot_raxml_output/reroot_raxml_output-{taxon}.log"
     benchmark:
-        "benchmarks/reroot_raxml_output/reroot_raxml_output-{scatteritem}.benchmark.txt"
+        "benchmarks/reroot_raxml_output/reroot_raxml_output-{taxon}.benchmark.txt"
     conda: "envs/reroot_backbone.yml"
     shell:
         """
@@ -340,15 +358,15 @@
 # to those optimized freely on a total tree.
 rule choose_exemplars:
     input:
-        alignment = "results/fasta/family/{scatteritem}/aligned.fa",
-        tree= "results/fasta/family/{scatteritem}/aligned.fa.raxml.bestTree.rooted"
-    output: "results/fasta/family/{scatteritem}/exemplars.fa"
+        alignment = "results/fasta/taxon/{taxon}/aligned.fa",
+        tree= "results/fasta/taxon/{taxon}/aligned.fa.raxml.bestTree.rooted"
+    output: "results/fasta/taxon/{taxon}/exemplars.fa"
     params:
         log_level = config['log_level'],
         strategy = 'median'
-    log: "logs/choose_exemplars/choose_exemplars-{scatteritem}.log"
+    log: "logs/choose_exemplars/choose_exemplars-{taxon}.log"
     benchmark:
-        "benchmarks/choose_exemplars/choose_exemplars-{scatteritem}.benchmark.txt"
+        "benchmarks/choose_exemplars/choose_exemplars-{taxon}.benchmark.txt"
     conda: "envs/choose_exemplars.yaml"
     shell:
         """
@@ -371,7 +389,7 @@
 # data on the command line, awk turns it into FASTA.
 rule prep_raxml_backbone:
     input:
-        fastas=gather.split("results/fasta/family/{scatteritem}/exemplars.fa"),
+        fastas=get_prep_raxml_backbone_input,
         opentol='results/databases/megatree_loader.ok'
     output:
         fasta="results/fasta/raxml-ready.fa",
@@ -470,7 +488,6 @@ rule graft_clades:
         tree = "results/grafted.tre"
     params:
         log_level = config['log_level'],
-        nfamilies = config["nfamilies"]
     log: "logs/graft_clades/graft_clades.log"
     benchmark:
         "benchmarks/graft_clades.benchmark.txt"
@@ -482,6 +499,5 @@
             -f {input.clades} \
            -e {input.extinct} \
            -o {output.tree} \
-            -n {params.nfamilies} \
            -v {params.log_level} 2> {log}
        """
diff --git a/workflow/scripts/create_database.py b/workflow/scripts/create_database.py
index 3ed7e12..cc00d56 100644
--- a/workflow/scripts/create_database.py
+++ b/workflow/scripts/create_database.py
@@ -1,3 +1,4 @@
+import os.path
 import sqlite3
 import util
 import argparse
@@ -126,7 +127,7 @@
     class TEXT NOT NULL,
     family TEXT NOT NULL,
     subfamily TEXT NOT NULL,
     genus TEXT NOT NULL,
-    species TEXT NOT NULL,
+    species TEXT NOT NULL,
     bin_uri TEXT NOT NULL,
     opentol_id INTEGER,
     UNIQUE(kingdom, phylum, class, "order", family, subfamily, genus, species, bin_uri));
@@ -200,10 +201,17 @@ def update_fk():
     # Instantiate logger
     logger = util.get_formatted_logger('create_database', args.verbosity)
 
+    # Start from a clean slate: the database is rebuilt from scratch on every run
+    if os.path.isfile(args.outdb):
+        os.unlink(args.outdb)
+
     # Connect to the database
     logger.info('Going to connect to database')
     connection = sqlite3.connect(args.outdb)
     database_cursor = connection.cursor()
+    database_cursor.execute('PRAGMA journal_mode=OFF')
+    database_cursor.execute('PRAGMA synchronous=OFF')
+    database_cursor.execute('PRAGMA cache_size=100000')
+    database_cursor.execute('PRAGMA temp_store=MEMORY')
 
     # Create database tables
     create_taxon_table('taxon')
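The PRAGMA block above, repeated in map_opentol.py at the end of this patch, trades crash safety for bulk-load speed: no rollback journal, no fsync per write, a larger page cache, and in-memory temporaries. A self-contained sketch of the same connection setup (the function name is illustrative); corruption after a crash is acceptable here because these databases are regenerated from the input TSV:

    import sqlite3

    def bulk_load_connection(path):
        """Open a SQLite connection tuned for one-shot bulk loading."""
        conn = sqlite3.connect(path)
        cur = conn.cursor()
        cur.execute('PRAGMA journal_mode=OFF')   # no rollback journal
        cur.execute('PRAGMA synchronous=OFF')    # no fsync after each write
        cur.execute('PRAGMA cache_size=100000')  # larger page cache
        cur.execute('PRAGMA temp_store=MEMORY')  # temp tables and indices in RAM
        return conn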
diff --git a/workflow/scripts/family_fasta.py b/workflow/scripts/family_fasta.py
index c567e57..7ec41d5 100644
--- a/workflow/scripts/family_fasta.py
+++ b/workflow/scripts/family_fasta.py
@@ -4,7 +4,9 @@
 import pandas as pd
 import argparse
 import util
+from pathlib import Path
 
+levels = ['kingdom', 'phylum', 'class', 'order', 'family', 'subfamily', 'genus', 'all']
 
 """
 This script, `family_fasta.py`, is responsible for generating FASTA files for each family of a specified higher taxon
@@ -35,14 +37,14 @@ def get_family_bins(q, conn):
     """
 
     # Check if filter_level in config.yaml is usable
-    if q['level'].lower() in ['kingdom', 'phylum', 'class', 'order', 'family', 'subfamily', 'genus', 'all']:
+    if q['level'].lower() in levels:
 
         # Select all distinct family names that match config.yaml filters
         level = q['level']
         name = q['name']
         marker_code = q['marker_code']
         sql = f'''
-            SELECT DISTINCT family, bin_uri
+            SELECT DISTINCT family, genus, species, bin_uri
             FROM barcode
             WHERE marker_code = '{marker_code}'
            AND "{level}" = '{name}'
@@ -61,7 +63,7 @@
     return fam
 
 
-def write_bin(q, conn, fh):
+def write_bin(q, conn, outfile):
     """
     Writes the longest sequence for a BIN to file
     :param q: query object
@@ -78,7 +80,7 @@
         JOIN taxon t ON b.taxon_id = t.taxon_id
         WHERE
             t."{q["level"]}" = "{q["name"]}" AND
-            t.family = "{q["family"]}" AND
+            t."{q["rank"]}" = "{q["taxon"]}" AND
             t.bin_uri = "{q["bin_uri"]}" AND
             b.marker_code = "{q["marker_code"]}" AND
             t.species IS NOT NULL AND
@@ -90,13 +92,14 @@
     famseq = pd.read_sql_query(query, conn)
 
-    # Append to file handle fh
-    for _, row in famseq.iterrows():
-        defline = f'>{row["barcode_id"]}|ott{row["opentol_id"]}|{row["processid"]}|{row["bin_uri"]}|{row["species"]}\n'
-        fh.write(defline)
+    # Append to outfile: this function is called once per BIN for the same file,
+    # so mode "w" would keep only the last BIN
+    with open(outfile, "a") as fh:
+        for _, row in famseq.iterrows():
+            defline = f'>{row["barcode_id"]}|ott{row["opentol_id"]}|{row["processid"]}|{row["bin_uri"]}|{row["species"]}\n'
+            fh.write(defline)
 
-        # Strip non-ACGT characters (dashes, esp.) because hmmer chokes on them
-        seq = row['nuc'].replace('-', '') + '\n'
-        fh.write(seq)
+            # Strip non-ACGT characters (dashes, esp.) because hmmer chokes on them
+            seq = row['nuc'].replace('-', '') + '\n'
+            fh.write(seq)
 
 if __name__ == '__main__':
@@ -106,15 +109,28 @@
     parser.add_argument('-f', '--fasta_dir', required=True, help='Directory to write FASTA files to')
     parser.add_argument('-l', '--level', required=True, help='Taxonomic level to filter (e.g. order)')
     parser.add_argument('-n', '--name', required=True, help='Taxon name to filter (e.g. Primates)')
-    parser.add_argument('-c', '--chunks', required=True, help="Number of chunks (families) to write to file")
+    parser.add_argument('-L', '--limit', required=True, type=int,
+                        help='Maximum number of sequences per FASTA file; above it, output is split at a lower rank (e.g. 200)')
     parser.add_argument('-m', '--marker', required=True, help='Marker code, e.g. COI-5P')
     parser.add_argument('-v', '--verbosity', required=True, help='Log level (e.g. DEBUG)')
     args = parser.parse_args()
     database_file = args.database
 
+    level = args.level.lower()
+    if level not in levels:
+        raise Exception(f"Filter level {level} from config file does not exist as a column in the database")
+
+    if level != 'all' and levels.index(level) > levels.index("family"):
+        raise Exception("Level filter value must be 'family' or a higher rank")
+
     # Configure logger
     logger = util.get_formatted_logger('family_fasta', args.verbosity)
 
+    try:
+        os.makedirs(args.fasta_dir, exist_ok=True)
+    except OSError as error:
+        logger.error(error)
+        exit(1)
+
     # Connect to the database (creates a new file if it doesn't exist)
     logger.info(f"Going to connect to database {args.database}")
     connection = sqlite3.connect(args.database)
@@ -128,29 +144,68 @@
     }
     df = get_family_bins(query, connection)
 
-    # Iterate over distinct families
-    index = 1
-    for family in df['family'].unique():
-        logger.info(f"Writing {family}")
+    def write_fasta(query, rank, taxon, bin_uris):
+        logger.info(f"Writing {taxon} ({rank})")
 
         # Make directory and open file handle
-        subdir = os.path.join(args.fasta_dir, f"{index}-of-{args.chunks}")
+        align_file = os.path.join(args.fasta_dir, "taxon", taxon, "unaligned.fa")
         try:
-            os.mkdir(subdir)
+            os.makedirs(Path(align_file).parent, exist_ok=True)
         except OSError as error:
-            logger.warning(error)
-
-        with open(os.path.join(subdir, 'unaligned.fa'), 'w') as handle:
-
-            # Iterate over bins in family
+            logger.error(error)
+            exit(1)
+
+        # Truncate any leftover file from a previous run; write_bin appends per BIN
+        open(align_file, "w").close()
+
+        # Iterate over bins in the taxon
+        for bin_uri in bin_uris:
+            logger.debug(f"Writing {bin_uri}")
+            query['bin_uri'] = bin_uri
+            query["rank"] = rank
+            query["taxon"] = taxon
+            write_bin(query, connection, align_file)
+
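+    # taxon_fasta.tsv records one row per taxon considered:
+    # name <TAB> rank <TAB> BIN count <TAB> split to a lower rank? <TAB> parent taxon (empty for families)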
+    # Iterate over distinct families
+    with open(os.path.join(args.fasta_dir, "taxon_fasta.tsv"), "w") as fw:
+        unique_families = df['family'].unique()
+        split_families = []
+        for family in unique_families:
             family_bin_uris = df[df['family'] == family]['bin_uri'].unique()
-            for bin_uri in family_bin_uris:
-                logger.debug(f"Writing {bin_uri}")
-                query['bin_uri'] = bin_uri
-                query['family'] = family
-                write_bin(query, connection, handle)
-
-        index += 1
+            if len(family_bin_uris) > args.limit:
+                split_families.append(family)
+                fw.write(f"{family}\tfamily\t{len(family_bin_uris)}\tTrue\t\n")
+                continue
+
+            write_fasta(query, "family", family, family_bin_uris)
+            fw.write(f"{family}\tfamily\t{len(family_bin_uris)}\tFalse\t\n")
+
+        # Families over the limit are retried per genus
+        split_genera = []
+        for family in split_families:
+            unique_genera = df[df['family'] == family]['genus'].unique()
+            for genus in unique_genera:
+                if not genus:
+                    continue
+                genus_bin_uris = df[(df['family'] == family) & (df['genus'] == genus)]['bin_uri'].unique()
+                if len(genus_bin_uris) > args.limit:
+                    split_genera.append(genus)
+                    fw.write(f"{genus}\tgenus\t{len(genus_bin_uris)}\tTrue\t{family}\n")
+                    continue
+
+                write_fasta(query, "genus", genus, genus_bin_uris)
+                fw.write(f"{genus}\tgenus\t{len(genus_bin_uris)}\tFalse\t{family}\n")
+
+        # Genera over the limit are retried per species; species is the last resort
+        for genus in split_genera:
+            unique_species = df[df['genus'] == genus]['species'].unique()
+            for species in unique_species:
+                if not species:
+                    continue
+                species_bin_uris = df[(df['genus'] == genus) & (df['species'] == species)]['bin_uri'].unique()
+                if len(species_bin_uris) > args.limit:
+                    raise NotImplementedError(
+                        f"Species {species} has more than {args.limit} BINs; splitting below species rank is not implemented")
+                write_fasta(query, "species", species, species_bin_uris)
+                fw.write(f"{species}\tspecies\t{len(species_bin_uris)}\tFalse\t{genus}\n")
 
     # Close the connection
     connection.close()
diff --git a/workflow/scripts/makeblastdb.sh b/workflow/scripts/makeblastdb.sh
index 3267d49..583e840 100644
--- a/workflow/scripts/makeblastdb.sh
+++ b/workflow/scripts/makeblastdb.sh
@@ -1,7 +1,6 @@
 DATABASE=$1
 FASTADIR=$2
-ITERATIONS=$3
-TMP=$4
+TMP=$3
 
 # This shell script, `makeblastdb.sh`, is responsible for creating a BLAST database from FASTA files generated by
 # `family_fasta.py`. The script performs the following steps:
@@ -20,10 +19,10 @@ TMP=$4
 
 # doing this sequentially to avoid race conditions in BLAST indexing
-for i in $(seq 1 $ITERATIONS); do
-
-  # using the output from family_fasta
-  INFILE=${FASTADIR}/${i}-of-${ITERATIONS}/unaligned.fa
+i=0
+for TAXON in $(ls -d $FASTADIR/taxon/*); do
+  # arithmetic increment; plain 'i+=1' would concatenate strings in bash
+  i=$((i+1))
+  INFILE=${TAXON}/unaligned.fa
 
   # only keep records with ott IDs, reformat the headers to retain the process ID, write to $TMP
   # start new file on first iteration, then append
diff --git a/workflow/scripts/map_opentol.py b/workflow/scripts/map_opentol.py
index 9847601..e3cdbad 100644
--- a/workflow/scripts/map_opentol.py
+++ b/workflow/scripts/map_opentol.py
@@ -133,6 +133,10 @@ def postprocess_db():
     logger.info(f'Going to connect to database {args.database}')
     conn = sqlite3.connect(args.database)
     cursor = conn.cursor()
+    cursor.execute('PRAGMA journal_mode=OFF')
+    cursor.execute('PRAGMA synchronous=OFF')
+    cursor.execute('PRAGMA cache_size=100000')
+    cursor.execute('PRAGMA temp_store=MEMORY')
 
     # Infer taxonomic context from marker name
     if args.marker == "COI-5P":