Skip to content
4 changes: 2 additions & 2 deletions 01_runBasecall.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

## Allocate resources
#SBATCH --time=3-00:00:00
#SBATCH --mem-per-cpu=8G
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=16G
#SBATCH --cpus-per-task=4

## job name
#SBATCH --job-name="guppy"
Expand Down
4 changes: 3 additions & 1 deletion 02_runAlignFastq.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
## Allocate resources
#SBATCH --time=2-00:00:00
#SBATCH --mem-per-cpu=8G
#SBATCH --array=4
#SBATCH --array=1-6
#SBATCH --mail-user=jennifer.semple@izb.unibe.ch
#SBATCH --mail-type=END,FAIL
## you should submit as many jobs as there are barcodes in barcodesOfInterest
## (don't forget to include unclassfied in barcodesOfInterst in the varSettings.sh file)

Expand Down
33 changes: 33 additions & 0 deletions 02_runAlignFastqdb.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#! /bin/bash

## Allocate resources
#SBATCH --time=2-00:00:00
#SBATCH --mem-per-cpu=8G
#SBATCH --array=1-6
#SBATCH --mail-user=jennifer.semple@izb.unibe.ch
#SBATCH --mail-type=END,FAIL
## you should submit as many jobs as there are barcodes in barcodesOfInterest
## (don't forget to include unclassfied in barcodesOfInterst in the varSettings.sh file)

## job name
#SBATCH --job-name="npAlign"

# read in the run specific settings
source ./varSettings.sh


let i=$SLURM_ARRAY_TASK_ID-1

# organise fastq, do QC plots and align with minimap
./alignFastqdb.sh ${expName} ${barcodesOfInterest[$i]} ${genomeFile}


# if barcode is one with a spikein, align also to other genomes
for bc in ${bcWithSpikeIn[@]}
do
if [ "${barcodesOfInterest[$i]}" == "$bc" ]; then
./alignFastqdb_spikeIn.sh ${expName} ${barcodesOfInterest[$i]} ${lambdaFile}
./alignFastqdb_spikeIn.sh ${expName} ${barcodesOfInterest[$i]} ${phiXfile}
exit
fi
done
212 changes: 109 additions & 103 deletions alignFastq.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,14 @@
## script to arrange all fastq files into batches for analysis. Combined files will be created for
## each barcode of interest for both passed and failed reads (e.g. pass_barcode01 or fail_barcode01).

source ./varSettings.sh
# Get variables from command line
expName=$1 # experiment name (date of exp usually)
bc=$2 # barcode
genomeFile=$3 # full path to reference genome

# need absolute paths for nanopolish index. get it from the summary file.
#summaryFile=`readlink -f ../fastqFiles/sequencing_summary.txt`
expPath=`readlink -f ../`
#expPath=/data/projects/p025/Jenny/20190411_dSMFv021-025np_N2gw
summaryFile=${expPath}/fastqFiles/sequencing_summary.txt
summaryFile=${workDir}/fastqFiles/sequencing_summary.txt

####### modules to load ##########
module load vital-it
Expand All @@ -21,78 +19,61 @@ module add UHTS/Analysis/samtools/1.8;
############################################


##########################################################
# function to run Pauvre and nanoQC on individual batches
##########################################################

## activate python environment for QC programmes (Pauvre and NanoQC)
source activate albacore_env

## function for running Pauvre and NanoQC on each combined file
do_pauvre_nanoQC() {
# get variables from arguments
bc=$1 #barcode
pf=$2 #pass or fail folder
eN=$3 #expName
qc=$4 #qcDir
fq=${expPath}/bcFastq/${eN}_${pf}_${bc}.fastq.gz #input file to qc

# assemble names for some of the output files
outDir=${qc}/${pf}_${bc}
mkdir -p $outDir
outStats=${outDir}/pauvreStats.txt

# run QC programmes
pauvre stats -f $fq > ${outStats}
nanoQC $fq -o $outDir
}


################################################
# Collecting reads from barcodes that were used
################################################
echo "collecting reads from folder of barcodes that were used..."

# merge all reads from particular barcode into single file (pass fail separately)
echo ${expPath}/fast5Files

if [ -d ${expPath}/bcFastq/pass/${bc} ];
then
echo "passed reads..."
cat ${expPath}/bcFastq/pass/${bc}/*.fastq > ${expPath}/bcFastq/${expName}_pass_${bc}.fastq
gzip ${expPath}/bcFastq/${expName}_pass_${bc}.fastq
rm ${expPath}/bcFastq/${expName}_pass_${bc}.fastq
${NANOPOLISH_DIR}/nanopolish index -s ${summaryFile} -d ${expPath}/fast5Files ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz
do_pauvre_nanoQC $bc pass $expName ${expPath}/qc
fi

if [ -d ${expPath}/bcFastq/fail/${bc} ];
then
echo "failed reads ..."
cat ${expPath}/bcFastq/fail/${bc}/*.fastq > ${expPath}/bcFastq/${expName}_fail_${bc}.fastq
gzip ${expPath}/bcFastq/${expName}_fail_${bc}.fastq
rm ${expPath}/bcFastq/${expName}_fail_${bc}.fastq
${NANOPOLISH_DIR}/nanopolish index -s ${summaryFile} -d ${expPath}/fast5Files ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz
do_pauvre_nanoQC $bc fail $expName ${expPath}/qc
fi


################################################
# aligning to genome
################################################

echo "aligning to genome..."

mkdir -p ${expPath}/bamFiles

#map reads to genome with minimap2
# filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem]
minimap2 -ax map-ont $genomeFile ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T ${expName}_pass_${bc} -o ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam
minimap2 -ax map-ont $genomeFile ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T ${expName}_fail_${bc} -o ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam

echo "index bam file ..."
samtools index ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam
samtools index ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam
source ${HOME}/.bashrc
source ${CONDA_ACTIVATE} nanopore


#################################################
## Collecting reads from barcodes that were used
#################################################
#echo "collecting reads from folder of barcodes that were used..."
#
## merge all reads from particular barcode into single file (pass fail separately)
#echo ${dataDir}/fast5Files
#mkdir -p ${workDir}/qc/NanoStat
#
#if [ -d "${workDir}/bcFastq/pass/${bc}" ];
#then
# echo "passed reads..."
# cat ${workDir}/bcFastq/pass/${bc}/*.fastq > ${workDir}/bcFastq/${expName}_pass_${bc}.fastq
# gzip ${workDir}/bcFastq/${expName}_pass_${bc}.fastq
# rm ${workDir}/bcFastq/${expName}_pass_${bc}.fastq
# nanopolish index -s ${summaryFile} -d ${dataDir}/fast5Files ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz
# mkdir -p ${workDir}/qc/pass_${bc}
# nanoQC ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -o ${workDir}/qc/pass_${bc}
# NanoStat --fastq ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz --outdir ${workDir}/qc/NanoStat --name NanoStat_${expName}_pass_${bc}.txt --readtype 1D
#fi
#
#if [ -d "${workDir}/bcFastq/fail/${bc}" ];
#then
# echo "failed reads ..."
# cat ${workDir}/bcFastq/fail/${bc}/*.fastq > ${workDir}/bcFastq/${expName}_fail_${bc}.fastq
# gzip ${workDir}/bcFastq/${expName}_fail_${bc}.fastq
# rm ${workDir}/bcFastq/${expName}_fail_${bc}.fastq
# nanopolish index -s ${summaryFile} -d ${dataDir}/fast5Files ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz
# mkdir -p ${workDir}/qc/fail_${bc}
# nanoQC ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -o ${workDir}/qc/fail_${bc}
# NanoStat --fastq ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz --outdir ${workDir}/qc/NanoStat --name NanoStat_${expName}_fail_${bc}.txt --readtype 1D
#fi
#
#
#################################################
## aligning to genome
#################################################
#
#echo "aligning to genome..."
#
#mkdir -p ${workDir}/bamFiles
#
## map reads to genome with minimap2
## filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem]
#minimap2 -ax map-ont $genomeFile ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T pass_${bc} -o ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam
#minimap2 -ax map-ont $genomeFile ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T fail_${bc} -o ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam
#
#echo "index bam file ..."
#samtools index ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam
#samtools index ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam
#


################################################
Expand All @@ -111,76 +92,101 @@ chr=( `cut -f1 chrom.sizes` )
# identifying CmG
################################################
echo "identify CmG ..."
doFail=FALSE

mkdir -p ${expPath}/meth_calls/
mkdir -p ${workDir}/meth_calls/

for i in "${!chr[@]}"
do
${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv

${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv
nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv
if [ "$doFail" = "TRUE" ]
then
echo "analysing failed reads"
nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv
fi
done


#### combine separate chromosomes into single file ####

# first write header
head -1 ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv
head -1 ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv
head -1 ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv
if [ "$doFail" = TRUE ]
then
head -1 ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv
fi


# then combine files
for i in "${!chr[@]}"
do
tail -n +2 ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv
tail -n +2 ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv
rm ${expPath}/meth_calls/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv
tail -n +2 ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv
if [ "$doFail" = TRUE ]
then
tail -n +2 ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv
fi
rm ${workDir}/meth_calls/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv
done


#### caclulating frequency ######
mkdir -p ${expPath}/meth_freq
${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv > ${expPath}/meth_freq/${expName}_pass_${bc}_freqCmG.tsv

${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv > ${expPath}/meth_freq/${expName}_fail_${bc}_freqCmG.tsv
mkdir -p ${workDir}/meth_freq
${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv > ${workDir}/meth_freq/${expName}_pass_${bc}_freqCmG.tsv

if [ "$doFail" = TRUE ]
then
${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv > ${workDir}/meth_freq/${expName}_fail_${bc}_freqCmG.tsv
fi

################################################
# identifying GCm
################################################
echo "identify GCm ..."

mkdir -p ${expPath}/meth_calls/
mkdir -p ${workDir}/meth_calls/

for i in "${!chr[@]}"
do
echo ${chr[$i]}
echo ${chrIntervals[$i]}
${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -r ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv

${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -r ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv
echo ${chr[$i]}
echo ${chrIntervals[$i]}
nanopolish call-methylation -t 4 -q gpc -r ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv

if [ "${doFail}" = TRUE ]
then
nanopolish call-methylation -t 4 -q gpc -r ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv
fi
done


#### combine separate chromosomes into single file ####

# first write header
head -1 ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv
head -1 ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv
head -1 ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv
if [ "$doFail" = TRUE ]
then
head -1 ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv
fi

# then combine files
for i in "${!chr[@]}"
do
tail -n +2 ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv
tail -n +2 ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv
rm ${expPath}/meth_calls/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv
tail -n +2 ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv
if [ "$doFail" = TRUE ]
then
tail -n +2 ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv
fi
rm ${workDir}/meth_calls/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv
done


#### caclulating frequency ######
mkdir -p ${expPath}/meth_freq
mkdir -p ${workDir}/meth_freq

${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv > ${expPath}/meth_freq/${expName}_pass_${bc}_freqGCm.tsv
${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv > ${workDir}/meth_freq/${expName}_pass_${bc}_freqGCm.tsv

${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv > ${expPath}/meth_freq/${expName}_fail_${bc}_freqGCm.tsv
if [ "$doFail" = TRUE ]
then
${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv > ${workDir}/meth_freq/${expName}_fail_${bc}_freqGCm.tsv
fi


Loading