Skip to content

Commit 2d60bfd

Browse files
authored
add syndna command (#15)
* add syndna command * fix error * adding tests * syndna CONDA_ENVIRONMENT
1 parent 11de51b commit 2d60bfd

File tree

11 files changed

+399
-58
lines changed

11 files changed

+399
-58
lines changed

.github/workflows/qiita-plugin-ci.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ jobs:
145145
lint:
146146
runs-on: ubuntu-latest
147147
steps:
148-
- name: flake8
148+
- name: ruff
149149
uses: actions/setup-python@v2
150150
with:
151151
python-version: 3.9
@@ -155,5 +155,5 @@ jobs:
155155
uses: actions/checkout@v2
156156
- name: lint
157157
run: |
158-
pip install -q flake8
159-
flake8 .
158+
pip install -q ruff
159+
ruff check .

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ Qiita plugin to process PacBio reads; it currently provides 2 commands for Qiita
88
* **PacBio processing**: which goes from step 1 to 7 in the image below. The expected output
99
is a main folder with folders per-sample and folders for each of the different outputs, as follows:
1010

11-
* **MAG** folder: all Metagenome-Assembled Genome (MAG) generatedfor that sample
11+
* **MAG** folder: all Metagenome-Assembled Genome (MAG) generated for that sample
1212
* **LCG** folder: all Long-Circular Genome (LCG) generated for that sample that are over 512kb in size - approximate 515,000 bases (half a million)
1313
* **small_LCG** folder: all Long-Circular Genome (LCG) generated for that sample that are under 512kb in size
1414
* **[sample-name].fna.gz**: the no LCG reads used for MAG generation
15-
* **[sample-name].checkm.txt.gz**: MAG quanlity information from CheckM v1.2.3
15+
* **[sample-name].checkm.txt.gz**: MAG quality information from CheckM v1.2.3
1616

1717

1818
.. image:: images/PacBioProcessing.png

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ dependencies = [
3636
"click>=3.3",
3737
"pandas",
3838
'requests',
39-
'flake8',
39+
'ruff',
4040
'coverage',
4141
'pytest-cov',
4242
'numpy',
@@ -46,6 +46,8 @@ dependencies = [
4646
"qiita-files@https://github.com/qiita-spots/qiita-files/archive/master.zip",
4747
"qiita_client@https://github.com/qiita-spots/qiita_client/archive/master.zip",
4848
"woltka@git+https://github.com/qiyunzhu/woltka.git#egg=woltka",
49+
"pysyndna@git+https://github.com/biocore/pysyndna.git#egg=pysyndna",
50+
'micov',
4951
]
5052

5153
[project.scripts]

qp_pacbio/__init__.py

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,14 @@
55
#
66
# The full license is in the file LICENSE, distributed with this software.
77
# -----------------------------------------------------------------------------
8-
from qiita_client import QiitaPlugin, QiitaCommand
9-
from .qp_pacbio import pacbio_processing, minimap2_processing
10-
from .util import plugin_details
8+
from qiita_client import QiitaCommand, QiitaPlugin
119

10+
from .qp_pacbio import (
11+
minimap2_processing,
12+
pacbio_processing,
13+
syndna_processing,
14+
)
15+
from .util import plugin_details
1216

1317
plugin = QiitaPlugin(**plugin_details)
1418

@@ -60,3 +64,29 @@
6064
)
6165

6266
plugin.register_command(pacbio_processing_cmd)
67+
68+
#
69+
# syndna filtering
70+
#
71+
72+
req_params = {
73+
"artifact": ("artifact", ["per_sample_FASTQ"]),
74+
}
75+
opt_params = {"min_sample_counts": ("integer", "1")}
76+
outputs = {
77+
"SynDNA hits": "BIOM",
78+
"reads without SynDNA": "per_sample_FASTQ",
79+
}
80+
dflt_param_set = {
81+
"SynDNA": {"min_sample_counts": 1},
82+
}
83+
pacbio_processing_cmd = QiitaCommand(
84+
"Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2)",
85+
"Remove SynDNA reads using minimap2 and woltka",
86+
syndna_processing,
87+
req_params,
88+
opt_params,
89+
outputs,
90+
dflt_param_set,
91+
)
92+
plugin.register_command(pacbio_processing_cmd)

qp_pacbio/data/resources.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,15 @@ Woltka v0.1.7, minimap2:
5858
nprocs: 16
5959
wall_time_limit: 1-00:00:00
6060
mem_in_gb: 120
61+
Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2):
62+
syndna:
63+
node_count: 1
64+
nprocs: 16
65+
wall_time_limit: 10:00:00
66+
mem_in_gb: 60
67+
max_tasks: 16
68+
finish:
69+
node_count: 1
70+
nprocs: 16
71+
wall_time_limit: 1-00:00:00
72+
mem_in_gb: 120

qp_pacbio/data/templates/syndna.sbatch

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,25 +23,28 @@ sample_name=`echo $input | awk '{print $1}'`
2323
filename=`echo $input | awk '{print $2}'`
2424
fn=`basename ${filename}`
2525

26-
mkdir -p ${out_folder}/clean/
26+
mkdir -p ${out_folder}/filtered/
27+
28+
sn_folder=${out_folder}/bioms/${sample_name}
29+
mkdir -p ${sn_folder}
30+
31+
coverm contig --single $filename --reference ${db_folder}/All_synDNA_inserts.fasta --mapper minimap2-hifi \
32+
--min-read-percent-identity 0.95 --min-read-aligned-percent 0.0 -m mean count --threads {{nprocs}} \
33+
--output-file ${sn_folder}/${sample_name}.txt
34+
cat ${sn_folder}/${sample_name}_insert_counts.txt | sed 's/Contig/\#OTU ID/' | \
35+
sed 's/ Read Count//' > ${sn_folder}/${sample_name}.tsv
36+
biom convert -i ${sn_folder}/${sample_name}.txt -o ${sn_folder}/${sample_name}.biom --to-hdf5
2737

2838
# removing AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta not coverm
2939
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_plasmid.sam ${db_folder}/AllsynDNA_plasmids_FASTA_ReIndexed_FINAL.fasta $filename
3040
samtools view -F 4 -@ {{nprocs}} ${out_folder}/${sample_name}_plasmid.sam | awk '{print $1}' | sort -u > ${out_folder}/${sample_name}_plasmid_mapped.txt
3141
seqkit grep -v -f ${out_folder}/${sample_name}_plasmid_mapped.txt $filename > ${out_folder}/${sample_name}_no_plasmid.fastq
3242

33-
# removing All_synDNA_inserts.fasta use coverm
34-
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_inserts.sam ${db_folder}/All_synDNA_inserts.fasta ${out_folder}/${sample_name}_no_plasmid.fastq
35-
samtools view -bS -@ {{int(nprocs/2)}} ${out_folder}/${sample_name}_no_plasmid.fastq | samtools sort -@ {{int(nprocs/2)}} -O bam -o ${out_folder}/${sample_name}_inserts_sorted.sam
36-
coverm filter --bam-files ${out_folder}/${sample_name}_inserts_sorted.sam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_no_inserts.bam
37-
samtools view -O SAM -o ${out_folder}/${sample_name}_no_inserts_sorted.sam ${out_folder}/${sample_name}_no_inserts.bam
38-
awk '{print $1}' ${out_folder}/${sample_name}_no_inserts_sorted.sam > ${out_folder}/${sample_name}_reads_filtered.txt
39-
seqkit grep -v -f ${out_folder}/${sample_name}_reads_filtered.txt ${out_folder}/${sample_name}_no_plasmid.fastq > ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq
40-
4143
# removing GCF_000184185.1_ASM18418v1_genomic_chroso.fna use coverm
4244
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${out_folder}/${sample_name}_GCF_000184185.sam ${db_folder}/GCF_000184185.1_ASM18418v1_genomic_chroso.fna ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq
43-
samtools view -bS -@ {{int(nprocs/2)}} ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq | samtools sort -@ {{int(nprocs/2)}} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.sam
45+
samtools view -bS -@ {{ nprocs/2 | int }} ${out_folder}/${sample_name}_no_plasmid_no_inserts.fastq | samtools sort -@ {{ nprocs/2 | int }} -O bam -o ${out_folder}/${sample_name}_GCF_000184185_sorted.sam
4446
coverm filter --bam-files ${out_folder}/${sample_name}_GCF_000184185_sorted.sam --min-read-percent-identity 99.9 --min-read-aligned-percent 95 --threads {{nprocs}} -o ${out_folder}/${sample_name}_GCF_000184185.bam
4547
samtools view -O SAM -o ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam ${out_folder}/${sample_name}_no_inserts.bam
4648
awk '{print $1}' ${out_folder}/${sample_name}_no_GCF_000184185_sorted.sam > ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt
47-
seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_GCF_000184185.fastq | gz > ${out_folder}/clean/${fn}
49+
seqkit grep -v -f ${out_folder}/${sample_name}_GCF_000184185_reads_filtered.txt ${out_folder}/${sample_name}_GCF_000184185.fastq | gz > ${out_folder}/filtered/${fn}
50+
awk 'BEGIN {FS=OFS="\t"}; {print $1,$3}'
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
#SBATCH -J {{job_name}}
3+
#SBATCH -p qiita
4+
#SBATCH -N {{node_count}}
5+
#SBATCH -n {{nprocs}}
6+
#SBATCH --time {{wall_time_limit}}
7+
#SBATCH --mem {{mem_in_gb}}G
8+
#SBATCH -o {{output}}/merge/logs/%x-%A_%a.out
9+
#SBATCH -e {{output}}/merge/logs/%x-%A_%a.err
10+
11+
source ~/.bashrc
12+
set -e
13+
{{conda_environment}}
14+
cd {{output}}/
15+
16+
biom_merge_pacbio --base {{output}} --type syndna
17+
18+
# find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
19+
# micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz
20+
21+
# cd alignment
22+
# tar -cvf ../alignment.tar *.sam.xz
23+
24+
finish_qp_pacbio {{url}} {{qjid}} {{output}}

qp_pacbio/data/templates/woltka_minimap2_merge.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ for f in `ls bioms/*/per-gene.biom`; do
3939
done | parallel --halt now,fail=1 -j {{nprocs}}
4040
wait
4141

42-
biom_merge_pacbio --base {{output}}
42+
biom_merge_pacbio --base {{output}} --type woltka
4343

4444
find {{output}}/coverages/ -iname "*.cov" > {{output}}/cov_files.txt
4545
micov consolidate --paths {{output}}/cov_files.txt --lengths ${len_map} --output {{output}}/coverages.tgz

qp_pacbio/qp_pacbio.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,13 @@
1111
from shutil import copy2
1212
from subprocess import run
1313

14+
import pandas as pd
1415
import yaml
16+
from biom import load_table
1517
from jinja2 import Environment
18+
from pysyndna import (
19+
fit_linear_regression_models_for_qiita,
20+
)
1621
from qiita_client import ArtifactInfo
1722

1823
from .util import KISSLoader, find_base_path
@@ -424,3 +429,147 @@ def generate_minimap2_processing(qclient, job_id, out_dir, parameters, url):
424429
minimap2_merge_script = _write_slurm(f"{out_dir}/merge", m2mt, **params)
425430

426431
return minimap2_script, minimap2_merge_script
432+
433+
434+
def syndna_processing(qclient, job_id, parameters, out_dir):
435+
"""generates output for syndna processing.
436+
437+
Parameters
438+
----------
439+
qclient : tgp.qiita_client.QiitaClient
440+
Qiita server client.
441+
job_id : str
442+
Job id.
443+
parameters : dict
444+
Parameters for this job.
445+
out_dir : str
446+
Output directory.
447+
448+
Returns
449+
-------
450+
bool, list, str
451+
Results tuple for Qiita.
452+
"""
453+
qclient.update_job_step(job_id, "Commands finished")
454+
455+
errors = []
456+
ainfo = []
457+
fp_biom = f"{out_dir}/syndna.biom"
458+
# do we need to stor alignments?
459+
# fp_alng = f'{out_dir}/sams/final/alignment.tar'
460+
461+
if exists(fp_biom): # and exists(fp_alng):
462+
# if we got to this point a preparation file should exist in
463+
# the output folder
464+
prep = pd.read_csv(f"{out_dir}/prep_info.tsv", index_col=None, sep="\t")
465+
output = fit_linear_regression_models_for_qiita(
466+
prep, load_table(fp_biom), int(parameters["min_sample_counts"])
467+
)
468+
# saving results to disk
469+
lin_regress_results_fp = f"{out_dir}/lin_regress_by_sample_id.yaml"
470+
fit_syndna_models_log_fp = f"{out_dir}/fit_syndna_models_log.txt"
471+
with open(lin_regress_results_fp, "w") as fp:
472+
fp.write(output["lin_regress_by_sample_id"])
473+
with open(fit_syndna_models_log_fp, "w") as fp:
474+
fp.write(output["fit_syndna_models_log"])
475+
ainfo = [
476+
ArtifactInfo(
477+
"SynDNA hits",
478+
"BIOM",
479+
[
480+
(fp_biom, "biom"),
481+
# rm if fp_alng is not needed
482+
# (fp_alng, "log"),
483+
(lin_regress_results_fp, "log"),
484+
(fit_syndna_models_log_fp, "log"),
485+
],
486+
)
487+
]
488+
else:
489+
ainfo = []
490+
errors.append(
491+
'Missing files from the "SynDNA hits"; please '
492+
"contact [email protected] for more information"
493+
)
494+
495+
fp_seqs = f"{out_dir}/filtered"
496+
reads = []
497+
for f in glob(f"{fp_seqs}/*.fastq.gz"):
498+
reads.append((f, "raw_forward_seqs"))
499+
500+
if not errors:
501+
ainfo.append(ArtifactInfo("reads without SynDNA", "per_sample_FASTQ", reads))
502+
else:
503+
return False, ainfo, "\n".join(errors)
504+
505+
return True, ainfo, ""
506+
507+
508+
def generate_syndna_processing(qclient, job_id, out_dir, parameters, url):
509+
"""generates slurm scripts for syndna processing.
510+
511+
Parameters
512+
----------
513+
qclient : tgp.qiita_client.QiitaClient
514+
Qiita server client.
515+
job_id : str
516+
Job id.
517+
out_dir : str
518+
Output directory.
519+
parameters : dict
520+
Parameters for this job.
521+
url : str
522+
URL to send the respose, finish the job.
523+
524+
Returns
525+
-------
526+
str, str
527+
Returns the two filepaths of the slurm scripts
528+
"""
529+
resources = RESOURCES[
530+
"Remove SynDNA plasmid, insert, & GCF_000184185 reads (minimap2)"
531+
]
532+
main_parameters = {
533+
"conda_environment": CONDA_ENVIRONMENT,
534+
"output": out_dir,
535+
"qjid": job_id,
536+
}
537+
538+
qclient.update_job_step(
539+
job_id, "Step 1 of 4: Collecting info and generating submission"
540+
)
541+
542+
artifact_id = parameters["artifact"]
543+
544+
njobs = generate_sample_list(qclient, artifact_id, out_dir)
545+
546+
qclient.update_job_step(
547+
job_id,
548+
"Step 2 of 4: Creating submission templates",
549+
)
550+
551+
m2t = JGT("syndna.sbatch")
552+
step_resources = resources["syndna"]
553+
params = main_parameters | {
554+
"job_name": f"sd_{job_id}",
555+
"node_count": step_resources["node_count"],
556+
"nprocs": step_resources["nprocs"],
557+
"wall_time_limit": step_resources["wall_time_limit"],
558+
"mem_in_gb": step_resources["mem_in_gb"],
559+
"array_params": f"1-{njobs}%{step_resources['max_tasks']}",
560+
}
561+
minimap2_script = _write_slurm(f"{out_dir}/minimap2", m2t, **params)
562+
563+
m2mt = JGT("syndna_finish.sbatch")
564+
step_resources = resources["finish"]
565+
params = main_parameters | {
566+
"job_name": f"me_{job_id}",
567+
"node_count": step_resources["node_count"],
568+
"nprocs": step_resources["nprocs"],
569+
"wall_time_limit": step_resources["wall_time_limit"],
570+
"mem_in_gb": step_resources["mem_in_gb"],
571+
"url": url,
572+
}
573+
minimap2_merge_script = _write_slurm(f"{out_dir}/merge", m2mt, **params)
574+
575+
return minimap2_script, minimap2_merge_script

0 commit comments

Comments
 (0)