Skip to content

Alternative VSEARCH pipeline

Torbjørn Rognes edited this page Feb 15, 2021 · 4 revisions

Alternative VSEARCH pipeline

This is an example of how you may use VSEARCH to process a 16S rRNA dataset and obtain OTUs.

It is a more traditional pipeline and in contrast to the earlier example pipeline, it does not use a custom perl script.

The example also shows how Swarm may be used for clustering (currently commented out).

#!/bin/sh

# This is an example of a pipeline using vsearch to process data in the
# Mothur 16S rRNA MiSeq SOP tutorial dataset to perform initial paired-end
# read merging, quality filtering, chimera removal and OTU clustering.

THREADS=1
PERL=$(which perl)
VSEARCH=$(which vsearch)
SWARM=$(which swarm)
REF=../gold.fasta
MISEQSOPDATA="https://mothur.s3.us-east-2.amazonaws.com/wiki/miseqsopdata.zip"
GOLD="https://mothur.s3.us-east-2.amazonaws.com/wiki/silva.gold.bacteria.zip"

CLUSTERID=0.97
MAXEE=1.0

date

echo Obtaining Mothur MiSeq SOP tutorial dataset

if [ ! -e miseqsopdata.zip ]; then
    wget $MISEQSOPDATA
fi

echo Decompressing...
unzip -u -o miseqsopdata.zip

echo
echo Obtaining Gold reference database for chimera detection

if [ ! -e gold.fasta ]; then

    if [ ! -e silva.gold.bacteria.zip ]; then
        wget $GOLD
    fi

    echo Decompressing and reformatting...
    unzip -p silva.gold.bacteria.zip silva.gold.align | \
        sed -e "s/[.-]//g" > gold.fasta

fi

# Enter subdirectory

cd MiSeq_SOP

echo
echo Checking FASTQ format version for one file

$VSEARCH --fastq_chars $(ls -1 *.fastq | head -1)

# Process samples

for f in *_R1_*.fastq; do

    r=$(sed -e "s/_R1_/_R2_/" <<< "$f")
    s=$(cut -d_ -f1 <<< "$f")

    echo
    echo ====================================
    echo Processing sample $s
    echo ====================================
    echo

    $VSEARCH --fastq_mergepairs $f \
        --threads $THREADS \
        --reverse $r \
        --fastqout $s.merged.fastq \
        --fastq_eeout

    # Commands to demultiplex and remove tags and primers
    # using e.g. cutadapt may be added here.

    echo
    echo Calculate quality statistics
    echo

    $VSEARCH --fastq_eestats $s.merged.fastq \
        --output $s.stats

    echo
    echo Quality filtering
    echo

    $VSEARCH --fastq_filter $s.merged.fastq \
        --fastq_maxee $MAXEE \
        --fastq_minlen 225 \
        --fastq_maxlen 275 \
        --fastq_maxns 0 \
        --fastaout $s.filtered.fasta \
        --fasta_width 0

    echo
    echo Dereplicate at sample level and relabel with sample.n
    echo

    $VSEARCH --derep_fulllength $s.filtered.fasta \
        --strand plus \
        --output $s.derep.fasta \
        --sizeout \
        --relabel $s. \
        --fasta_width 0

done

# At this point there should be one fasta file for each sample
# It should be quality filtered and dereplicated.

echo
echo ====================================
echo Processing all samples together
echo ====================================
echo
echo Merge all samples

cat *.derep.fasta > all.fasta

echo
echo Sum of unique sequences in each sample: $(cat all.fasta | grep -c "^>")
echo
echo Dereplicate across samples
echo

$VSEARCH --derep_fulllength all.fasta \
    --threads $THREADS \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --uc all.derep.uc \
    --output derep.fasta

echo
echo Unique sequences: $(grep -c "^>" derep.fasta)
echo
echo Cluster sequences using VSEARCH
echo

$VSEARCH --cluster_size derep.fasta \
    --threads $THREADS \
    --id $CLUSTERID \
    --strand plus \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --centroids centroids.fasta

#echo
#echo Cluster with Swarm using d=1 and fastidious mode
#echo
#
#$SWARM derep.fasta \
#    --threads $THREADS \
#    --differences 1 \
#    --fastidious \
#    --seeds centroids.fasta \
#    --usearch-abundance \
#    --output /dev/null

echo
echo Clusters: $(grep -c "^>" centroids.fasta)
echo
echo Sort and remove singletons
echo

$VSEARCH --sortbysize centroids.fasta \
    --threads $THREADS \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --minsize 2 \
    --output sorted.fasta

echo
echo Non-singleton clusters: $(grep -c "^>" sorted.fasta)
echo 
echo De novo chimera detection
echo

$VSEARCH --uchime_denovo sorted.fasta \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --qmask none \
    --nonchimeras denovo.nonchimeras.fasta \

echo
echo Unique sequences after de novo chimera detection: $(grep -c "^>" denovo.nonchimeras.fasta)
echo
echo Reference chimera detection
echo

$VSEARCH --uchime_ref denovo.nonchimeras.fasta \
    --threads $THREADS \
    --db $REF \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --qmask none \
    --dbmask none \
    --nonchimeras nonchimeras.fasta

echo
echo Unique sequences after reference-based chimera detection: $(grep -c "^>" nonchimeras.fasta)
echo
echo Relabel OTUs
echo

$VSEARCH --fastx_filter nonchimeras.fasta \
    --threads $THREADS \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --relabel OTU_ \
    --fastaout otus.fasta

echo
echo Number of OTUs: $(grep -c "^>" otus.fasta)
echo
echo Map sequences to OTUs by searching
echo

$VSEARCH --usearch_global all.fasta \
    --threads $THREADS \
    --db otus.fasta \
    --id $CLUSTERID \
    --strand plus \
    --sizein \
    --sizeout \
    --fasta_width 0 \
    --qmask none \
    --dbmask none \
    --otutabout otutab.txt

echo
echo Sort OTU table numerically
echo

sort -k1.5n otutab.txt > otutab.sorted.txt

echo
echo Done

date