Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
9464d2b
simplifying code and adding resources.yml
antgonza Oct 28, 2025
a6ffcc5
export ENVIRONMEN sooner in the script
antgonza Oct 28, 2025
b371545
add ENVIRONMENT in qp-pacbio yml
antgonza Oct 28, 2025
a2077b0
afterok -> afterany
antgonza Oct 28, 2025
77fc60e
CONDA_ENVIRONMENT
antgonza Oct 28, 2025
6e05385
fix tests
antgonza Oct 28, 2025
766bec5
CONDA_ENVIRONMENT
antgonza Oct 29, 2025
b6032b8
-J me_
antgonza Oct 29, 2025
ffdae88
adding missing params for merge
antgonza Oct 29, 2025
43a931f
data
antgonza Oct 29, 2025
5cf7de8
mv data to qp_pacbio
antgonza Oct 29, 2025
d11ba2a
find_base_path
antgonza Oct 29, 2025
8e6267a
--ignore=qp_pacbio/data
antgonza Oct 29, 2025
0071e1c
"results": result_fp,
antgonza Oct 30, 2025
80373a0
results -> result_fp
antgonza Oct 30, 2025
50a46cf
add completed
antgonza Oct 31, 2025
9a44417
output -> out_dir
antgonza Oct 31, 2025
fb42f44
SLURM_ARRAY_JOB_ID->SLURM_ARRAY_TASK_ID
antgonza Oct 31, 2025
bdb82df
rm extra hifiasm_meta
antgonza Oct 31, 2025
0cd35e7
validate failed_steps
antgonza Nov 1, 2025
a58a1f2
rm shopt
antgonza Nov 2, 2025
d2b227c
add file check FILES=(*.fa)
antgonza Nov 3, 2025
741e242
save small LCGs
antgonza Nov 5, 2025
52d76b2
update databases
antgonza Nov 5, 2025
6af6c17
forgot 1 update
antgonza Nov 5, 2025
f98217e
rm extra /
antgonza Nov 5, 2025
af437dd
update minimap2 woltka command
antgonza Nov 5, 2025
b9b5c76
nprocs -> 16
antgonza Nov 5, 2025
e58bb16
biom_merge_pacbio
antgonza Nov 6, 2025
9098888
woltka & biom
antgonza Nov 6, 2025
15bbe41
micov
antgonza Nov 6, 2025
a2775aa
pip https -> git
antgonza Nov 6, 2025
c6193a5
90 ->150
antgonza Nov 6, 2025
23fdf69
fix test
antgonza Nov 6, 2025
9d7fcb8
readd lcg_folder
antgonza Nov 6, 2025
e82c2f3
add finish_qp_pacbio to woltka
antgonza Nov 6, 2025
2286019
missing new line
antgonza Nov 6, 2025
813d03b
_small_LCGs -> _small_LCG
antgonza Nov 7, 2025
4a1fd74
09 -> 11 and default_params_set
antgonza Nov 10, 2025
20be910
default params should be a dict
antgonza Nov 10, 2025
75344ca
fixes after more testing
antgonza Nov 11, 2025
f694914
fix tests
antgonza Nov 11, 2025
b203403
rm >
antgonza Nov 11, 2025
5c9b92d
rm extras from if
antgonza Nov 11, 2025
5969f77
improve folder
antgonza Nov 11, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/qiita-plugin-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,9 @@ jobs:
pip --quiet install -U pip
pip --quiet install https://github.com/qiita-spots/qtp-job-output-folder/archive/refs/heads/main.zip

export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"
pip install -e .
pip --quiet install coveralls
export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"

configure_qtp_job_output_folder --env-script "source /home/runner/.profile; conda activate qp_pacbio_2025.9" --ca-cert $QIITA_ROOTCA_CERT
configure_qp_pacbio --env-script 'source /home/runner/.profile; conda activate qp_pacbio_2025.9; export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"' --ca-cert $QIITA_ROOTCA_CERT
Expand Down Expand Up @@ -134,8 +134,9 @@ jobs:
export QIITA_ROOTCA_CERT=`pwd`/qiita-dev/qiita_core/support_files/ci_rootca.crt
export QIITA_CONFIG_FP=`pwd`/qiita-dev/qiita_core/support_files/config_test_local.cfg
export PYTHONWARNINGS="ignore:Certificate for localhost has no \`subjectAltName\`"
export ENVIRONMENT="source /home/runner/.profile; conda activate qp_pacbio_2025.9"

pytest qp_pacbio --doctest-modules --cov=qp_pacbio --cov-report=lcov
pytest qp_pacbio --doctest-modules --cov=qp_pacbio --cov-report=lcov --ignore=qp_pacbio/data

- uses: codecov/codecov-action@v3
with:
Expand Down
63 changes: 0 additions & 63 deletions data/templates/2.get-circular-genomes.sbatch

This file was deleted.

10 changes: 8 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ build-backend = "setuptools.build_meta"

[tool.setuptools]
packages = ["qp_pacbio"]
include-package-data = true

[tool.setuptools.package-data]
"qp_pacbio" = ["data/*"]

[project]
name = "qp_pacbio"
# version strings must comply with PEP 440:
# https://peps.python.org/pep-0440/
version = "2025.09"
version = "2025.11"
authors = [{ name = "Qiita Development Team", email = "[email protected]" }]
description = "Qiita Plugin: PacBio Processing"
readme = "README.rst"
Expand Down Expand Up @@ -39,11 +41,15 @@ dependencies = [
'pytest-cov',
'numpy',
'Jinja2',
'PyYAML',
"qiita-files@https://github.com/qiita-spots/qiita-files/archive/master.zip",
"qiita_client@https://github.com/qiita-spots/qiita_client/archive/master.zip",
"woltka@git+https://github.com/qiyunzhu/woltka.git#egg=woltka",
"micov@git+https://github.com/biocore/micov.git#egg=micov",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor but you can probably get micov from pip now.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice if we can install from bioconda

]

[project.scripts]
configure_qp_pacbio = "qp_pacbio.scripts:config"
start_qp_pacbio = "qp_pacbio.scripts:execute"
finish_qp_pacbio = "qp_pacbio.scripts:finish_qp_pacbio"
biom_merge_pacbio = "qp_pacbio.scripts:biom_merge"
10 changes: 4 additions & 6 deletions qp_pacbio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#

req_params = {"artifact": ("integer", ["per_sample_FASTQ"])}
opt_params = dict()
opt_params = {"Database": ['choice:["WoLr2"]', "WoLr2"]}
outputs = {
# taxonomic
"Per genome Predictions": "BIOM",
Expand All @@ -27,8 +27,7 @@
"KEGG Enzyme (EC)": "BIOM",
"KEGG Pathway": "BIOM",
}
dflt_param_set = dict()

dflt_param_set = {"WoLr2": {"Database": "WoLr2"}}
minimap2_cmd = QiitaCommand(
"Woltka v0.1.7, minimap2",
"Functional and Taxonomic Predictions",
Expand All @@ -47,10 +46,9 @@
req_params = {
"artifact": ("integer", ["per_sample_FASTQ"]),
}
opt_params = dict()
opt_params = {"Processing": ['choice:["default"]', "default"]}
outputs = {"output": "job-output-folder"}
dflt_param_set = dict()

dflt_param_set = {"default": {"Processing": "default"}}
pacbio_processing_cmd = QiitaCommand(
"PacBio processing",
"Default PacBio processing for Metagenomic Data",
Expand Down
60 changes: 60 additions & 0 deletions qp_pacbio/data/resources.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Scheduler resources for every Qiita command (top-level key, matching the
# command name) and each of its steps (second-level key).
#
# node_count, nprocs, wall_time_limit and mem_in_gb are the values the sbatch
# templates place in their "#SBATCH -N / -n / --time / --mem" directives.
# max_tasks is presumably the cap on concurrently running array tasks --
# TODO confirm against the code that builds array_params.
#
# wall_time_limit values are quoted on purpose: YAML 1.1 loaders (PyYAML
# included) resolve an unquoted 10:00:00 as the base-60 integer 36000, not as
# a string.
PacBio processing:
  step-1:
    node_count: 1
    nprocs: 16
    wall_time_limit: "1-00:00:00"
    mem_in_gb: 200
    max_tasks: 16
  step-2:
    node_count: 1
    nprocs: 1
    wall_time_limit: "00:10:00"
    mem_in_gb: 2
    max_tasks: 16
  step-3:
    node_count: 1
    nprocs: 8
    wall_time_limit: "01:00:00"
    mem_in_gb: 10
    max_tasks: 16
  step-4:
    node_count: 1
    nprocs: 8
    wall_time_limit: "01:00:00"
    mem_in_gb: 6
    max_tasks: 16
  step-5:
    node_count: 1
    nprocs: 8
    wall_time_limit: "00:30:00"
    mem_in_gb: 2
    max_tasks: 16
  step-6:
    node_count: 1
    nprocs: 8
    wall_time_limit: "00:30:00"
    mem_in_gb: 2
    max_tasks: 16
  step-7:
    node_count: 1
    nprocs: 8
    wall_time_limit: "01:00:00"
    mem_in_gb: 50
    max_tasks: 16
  finish:
    node_count: 1
    nprocs: 1
    wall_time_limit: "00:10:00"
    mem_in_gb: 10
Woltka v0.1.7, minimap2:
  minimap2:
    node_count: 1
    nprocs: 16
    wall_time_limit: "10:00:00"
    mem_in_gb: 60
    max_tasks: 16
  merge:
    node_count: 1
    nprocs: 16
    wall_time_limit: "1-00:00:00"
    mem_in_gb: 120
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

source ~/.bashrc
set -e
conda activate {{conda_environment}}
{{conda_environment}}
cd {{output}}/step-1

step=${SLURM_ARRAY_TASK_ID}
Expand All @@ -28,3 +28,4 @@ if [[ "$step" == "1" ]]; then
fi

hifiasm_meta -t {{nprocs}} -o {{output}}/step-1/${sample_name} ${filename}
touch {{output}}/step-1/completed_${SLURM_ARRAY_TASK_ID}.log
96 changes: 96 additions & 0 deletions qp_pacbio/data/templates/2.get-circular-genomes.sbatch
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/bin/bash
#SBATCH -J {{job_name}}
#SBATCH -p qiita
#SBATCH -N {{node_count}}
#SBATCH -n {{nprocs}}
#SBATCH --time {{wall_time_limit}}
#SBATCH --mem {{mem_in_gb}}G
#SBATCH -o {{output}}/step-2/logs/%x-%A_%a.out
#SBATCH -e {{output}}/step-2/logs/%x-%A_%a.err
#SBATCH --array {{array_params}}

# Step 2 (Jinja-rendered sbatch template): for the sample assigned to this
# array task, pull the contigs out of the step-1 assembly graph
# (<sample>.p_ctg.gfa), split the circular ones into one FASTA per contig and
# set aside the small ones before they are filtered out.

source ~/.bashrc
set -e
# rendered as the full environment-activation command, so it stays unquoted
{{conda_environment}}
cd "{{output}}/step-1"

# Array task N processes line N of sample_list.txt; only the first column
# (the sample name) is needed in this step.
step=${SLURM_ARRAY_TASK_ID}
input=$(head -n "$step" "{{output}}/sample_list.txt" | tail -n 1)
sample_name=$(awk '{print $1}' <<< "$input")

# updating the GUI when task 1 runs
if [[ "$step" == "1" ]]; then
  python -c "from qp_pacbio.util import client_connect; qclient = client_connect('{{url}}'); qclient.update_job_step('{{qjid}}', 'Running step 2: ${SLURM_ARRAY_JOB_ID}')"
fi

# GFA segment ("S") records whose name matches /.c$/ are written as FASTA.
# awk reads the file directly so that a missing GFA aborts the task (set -e)
# instead of silently producing an empty FASTA.
awk '$1=="S" && ($2 ~ /.c$/) {printf ">%s\n%s\n", $2, $3} ' "${sample_name}.p_ctg.gfa" > "../step-2/${sample_name}_circ.fa"
seqkit split --by-id "../step-2/${sample_name}_circ.fa" -O "../step-2/${sample_name}_split"

### get all contigs for each sample
awk '$1=="S" {printf ">%s\n%s\n", $2, $3} ' "${sample_name}.p_ctg.gfa" > "../step-2/${sample_name}_all_contigs.fa"

cd "../step-2/${sample_name}_split"
# making a copy of the small_LCG (split files under 512k) before they are
# removed
mkdir -p "{{output}}/step-2/${sample_name}_small_LCG"
find . -maxdepth 1 -type f -size -512k -print0 | xargs -0 -r cp -t "../${sample_name}_small_LCG"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So small_LCG is defined by files in size < 512kb? Probably important to note in the documentation for the PacBio workflow.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was expecting small_LCG to be defined by total genome size

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jianshu93, can you comment?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By file size for now. This can be optimized; file sizes are proportional to total genome size.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Approximately 515,000 bases (about half a million), because one character takes approximately one byte.

### remove small circular genomes
# -maxdepth 1 restricts the deletion to the same top-level files that were
# copied to the small_LCG folder beforehand.
find . -maxdepth 1 -type f -size -512k -exec rm -f -- {} +

# The removal above can leave the folder without any FASTA file; in that case
# the glob stays unexpanded and the -f test on its first element fails.
FILES=(*.fa)
if [[ -f "${FILES[0]}" ]]; then
  # Collect the FASTA ids of all remaining circular genomes. The id list is
  # truncated first so a re-run cannot append duplicate ids.
  : > circular_id.txt
  for f in "${FILES[@]}"; do
    grep -E "^>" -- "$f" >> circular_id.txt
  done
  # drop the leading '>' so only the bare ids remain
  sed -i 's/^>//' circular_id.txt
  # noLCG = every contig whose id is NOT one of the kept circular genomes
  seqkit grep -v -f circular_id.txt "../${sample_name}_all_contigs.fa" > "../${sample_name}_noLCG.fa"
else
  # no circular genome left: every contig goes to noLCG
  cp "../${sample_name}_all_contigs.fa" "../${sample_name}_noLCG.fa"
fi

# Export the remaining circular genomes (gzipped, one .fna.gz per genome) to
# the per-sample LCG results folder.
lcg_folder="{{result_fp}}/${sample_name}/LCG"
mkdir -p "${lcg_folder}"
# FILES keeps its name because the completion-marker check further down
# reads it.
FILES=("{{output}}/step-2/${sample_name}_split/"*.fa)
if [[ -f "${FILES[0]}" ]]; then
  # iterate over the glob result directly instead of parsing `ls` output
  for f in "${FILES[@]}"; do
    # take the basename first so the substitutions below can only touch the
    # file name, never a directory component of the path
    sn=$(basename -- "$f")
    sn=${sn/_circ/}
    sn=${sn/part_/}
    # only the trailing .fa extension is swapped for .fna
    gzip < "$f" > "${lcg_folder}/${sn%.fa}.fna.gz"
  done
fi

# Export the non-LCG contigs next to the LCG folder.
mkdir -p "{{result_fp}}/${sample_name}/"
if [[ -f "{{output}}/step-2/${sample_name}_noLCG.fa" ]]; then
  gzip < "{{output}}/step-2/${sample_name}_noLCG.fa" > "{{result_fp}}/${sample_name}/${sample_name}.noLCG.fna.gz"
fi

touch "{{output}}/step-2/completed_${SLURM_ARRAY_TASK_ID}.log"
# if the files don't exist, it means that this step didn't generate any
# inputs for the next step; thus generating all the completed files.
# FILES was last set from the step-2 split folder glob, so its first element
# is either a real split FASTA or the unexpanded pattern.
# NOTE(review): <sample>_noLCG.fa appears to always be written earlier in
# this script (by seqkit grep or by cp), which would make this branch
# unreachable -- confirm whether an emptiness test (-s) was intended.
if [[ ! -f "${FILES[0]}" && ! -f "{{output}}/step-2/${sample_name}_noLCG.fa" ]]; then
  for later_step in 3 4 5 6 7; do
    touch "{{output}}/step-${later_step}/completed_${SLURM_ARRAY_TASK_ID}.log"
  done
fi

# saving small LCG, note that these are not processed downstream so not
# relevant to the "complete" files
small_lcg_folder="{{result_fp}}/${sample_name}/small_LCG"
mkdir -p "${small_lcg_folder}"
FILES=("{{output}}/step-2/${sample_name}_small_LCG/"*.fa)
if [[ -f "${FILES[0]}" ]]; then
  # iterate over the glob result directly instead of parsing `ls` output
  for f in "${FILES[@]}"; do
    # basename first: the substitutions must only affect the file name
    sn=$(basename -- "$f")
    sn=${sn/_circ/}
    sn=${sn/part_/}
    # only the trailing .fa extension is swapped for .fna
    gzip < "$f" > "${small_lcg_folder}/${sn%.fa}.fna.gz"
  done
fi
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

source ~/.bashrc
set -e
conda activate {{conda_environment}}
{{conda_environment}}
cd {{output}}

step=${SLURM_ARRAY_TASK_ID}
Expand All @@ -28,5 +28,14 @@ fi

folder=step-3/${sample_name}_binning
mkdir -p ${folder}
minimap2 -x map-hifi -t {{nprocs}} -a --MD --eqx -o ${folder}/${sample_name}.sam step-2/${sample_name}_noLCG.fa ${filename}
samtools view -bS -@4 ${folder}/${sample_name}.sam | samtools sort -@4 -O bam -o ${folder}/${sample_name}.sorted.bam

if [ -f step-2/${sample_name}_noLCG.fa ]; then
minimap2 -x map-hifi -I {{mem_in_gb}}G -t {{nprocs}} -a --MD --eqx -o ${folder}/${sample_name}.sam step-2/${sample_name}_noLCG.fa ${filename}
samtools view -bS -@4 ${folder}/${sample_name}.sam | samtools sort -@4 -O bam -o ${folder}/${sample_name}.sorted.bam
else
touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log
touch {{output}}/step-5/completed_${SLURM_ARRAY_TASK_ID}.log
touch {{output}}/step-6/completed_${SLURM_ARRAY_TASK_ID}.log
touch {{output}}/step-7/completed_${SLURM_ARRAY_TASK_ID}.log
fi
touch {{output}}/step-3/completed_${SLURM_ARRAY_TASK_ID}.log
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

source ~/.bashrc
set -e
conda activate {{conda_environment}}
{{conda_environment}}
cd {{output}}

step=${SLURM_ARRAY_TASK_ID}
Expand All @@ -26,13 +26,11 @@ if [[ "$step" == "1" ]]; then
python -c "from qp_pacbio.util import client_connect; qclient = client_connect('{{url}}'); qclient.update_job_step('{{qjid}}', 'Running step 4: ${SLURM_ARRAY_JOB_ID}')"
fi

folder=step-4/${sample_name}_binning

mkdir -p {{output}}/${folder}/work_files/
cp {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam {{output}}/step-4/${sample_name}_binning/work_files/${sample_name}.bam
rm {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam
folder={{output}}/step-4/${sample_name}_binning/work_files/
mkdir -p ${folder}
cp {{output}}/step-3/${sample_name}_binning/${sample_name}.sorted.bam ${folder}/${sample_name}.bam

ln -s ${filename} {{output}}/step-4/${sample_name}_binning/work_files/${sample_name}.fastq
metawrap binning -a {{output}}/step-2/${sample_name}_noLCG.fa -o {{output}}/step-4/${sample_name}_binning \
-t {{nprocs}} -m 100 -l 16000 --single-end --metabat2 --maxbin2 --concoct --universal {{output}}/step-4/${sample_name}_binning/work_files/${sample_name}.fastq
rm -rf {{output}}/step-4/${sample_name}_binning/work_files
-t {{nprocs}} -m 100 -l 16000 --single-end --metabat2 --maxbin2 --concoct --universal ${folder}/${sample_name}.fastq
touch {{output}}/step-4/completed_${SLURM_ARRAY_TASK_ID}.log
Loading
Loading