diff --git a/01_runBasecall.sh b/01_runBasecall.sh index 9dd7f1b..5bccb44 100644 --- a/01_runBasecall.sh +++ b/01_runBasecall.sh @@ -2,8 +2,8 @@ ## Allocate resources #SBATCH --time=3-00:00:00 -#SBATCH --mem-per-cpu=8G -#SBATCH --cpus-per-task=8 +#SBATCH --mem-per-cpu=16G +#SBATCH --cpus-per-task=4 ## job name #SBATCH --job-name="guppy" diff --git a/02_runAlignFastq.sh b/02_runAlignFastq.sh index 7fb2bd8..aaf9854 100644 --- a/02_runAlignFastq.sh +++ b/02_runAlignFastq.sh @@ -3,7 +3,9 @@ ## Allocate resources #SBATCH --time=2-00:00:00 #SBATCH --mem-per-cpu=8G -#SBATCH --array=4 +#SBATCH --array=1-6 +#SBATCH --mail-user=jennifer.semple@izb.unibe.ch +#SBATCH --mail-type=END,FAIL ## you should submit as many jobs as there are barcodes in barcodesOfInterest ## (don't forget to include unclassfied in barcodesOfInterst in the varSettings.sh file) diff --git a/02_runAlignFastqdb.sh b/02_runAlignFastqdb.sh new file mode 100644 index 0000000..bac581a --- /dev/null +++ b/02_runAlignFastqdb.sh @@ -0,0 +1,33 @@ +#! /bin/bash + +## Allocate resources +#SBATCH --time=2-00:00:00 +#SBATCH --mem-per-cpu=8G +#SBATCH --array=1-6 +#SBATCH --mail-user=jennifer.semple@izb.unibe.ch +#SBATCH --mail-type=END,FAIL +## you should submit as many jobs as there are barcodes in barcodesOfInterest +## (don't forget to include unclassfied in barcodesOfInterst in the varSettings.sh file) + +## job name +#SBATCH --job-name="npAlign" + +# read in the run specific settings +source ./varSettings.sh + + +let i=$SLURM_ARRAY_TASK_ID-1 + +# organise fastq, do QC plots and align with minimap +./alignFastqdb.sh ${expName} ${barcodesOfInterest[$i]} ${genomeFile} + + +# if barcode is one with a spikein, align also to other genomes +for bc in ${bcWithSpikeIn[@]} +do + if [ "${barcodesOfInterest[$i]}" == "$bc" ]; then + ./alignFastqdb_spikeIn.sh ${expName} ${barcodesOfInterest[$i]} ${lambdaFile} + ./alignFastqdb_spikeIn.sh ${expName} ${barcodesOfInterest[$i]} ${phiXfile} + exit + fi +done diff --git a/alignFastq.sh b/alignFastq.sh index 8abc5cd..15e58de 100755 --- a/alignFastq.sh +++ b/alignFastq.sh @@ -2,16 +2,14 @@ ## script to arrange all fastq files into batches for analysis. Combined files will be created for ## each barcode of interest for both passed and failed reads (e.g. pass_barcode01 or fail_barcode01). +source ./varSettings.sh # Get variables from command line expName=$1 # experiment name (date of exp usually) bc=$2 # barcode genomeFile=$3 # full path to reference genome # need absolute paths for nanopolish index. get it from the summary file. -#summaryFile=`readlink -f ../fastqFiles/sequencing_summary.txt` -expPath=`readlink -f ../` -#expPath=/data/projects/p025/Jenny/20190411_dSMFv021-025np_N2gw -summaryFile=${expPath}/fastqFiles/sequencing_summary.txt +summaryFile=${workDir}/fastqFiles/sequencing_summary.txt ####### modules to load ########## module load vital-it @@ -21,78 +19,61 @@ module add UHTS/Analysis/samtools/1.8; ############################################ -########################################################## -# function to run Pauvre and nanoQC on individual batches -########################################################## - -## activate python environment for QC programmes (Pauvre and NanoQC) -source activate albacore_env - -## function for running Pauvre and NanoQC on each combined file -do_pauvre_nanoQC() { - # get variables from arguments - bc=$1 #barcode - pf=$2 #pass or fail folder - eN=$3 #expName - qc=$4 #qcDir - fq=${expPath}/bcFastq/${eN}_${pf}_${bc}.fastq.gz #input file to qc - - # assemble names for some of the output files - outDir=${qc}/${pf}_${bc} - mkdir -p $outDir - outStats=${outDir}/pauvreStats.txt - - # run QC programmes - pauvre stats -f $fq > ${outStats} - nanoQC $fq -o $outDir -} - - -################################################ -# Collecting reads from barcodes that were used -################################################ -echo "collecting reads from folder of barcodes that were used..." - -# merge all reads from particular barcode into single file (pass fail separately) -echo ${expPath}/fast5Files - -if [ -d ${expPath}/bcFastq/pass/${bc} ]; -then - echo "passed reads..." - cat ${expPath}/bcFastq/pass/${bc}/*.fastq > ${expPath}/bcFastq/${expName}_pass_${bc}.fastq - gzip ${expPath}/bcFastq/${expName}_pass_${bc}.fastq - rm ${expPath}/bcFastq/${expName}_pass_${bc}.fastq - ${NANOPOLISH_DIR}/nanopolish index -s ${summaryFile} -d ${expPath}/fast5Files ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz - do_pauvre_nanoQC $bc pass $expName ${expPath}/qc -fi - - if [ -d ${expPath}/bcFastq/fail/${bc} ]; -then - echo "failed reads ..." - cat ${expPath}/bcFastq/fail/${bc}/*.fastq > ${expPath}/bcFastq/${expName}_fail_${bc}.fastq - gzip ${expPath}/bcFastq/${expName}_fail_${bc}.fastq - rm ${expPath}/bcFastq/${expName}_fail_${bc}.fastq - ${NANOPOLISH_DIR}/nanopolish index -s ${summaryFile} -d ${expPath}/fast5Files ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz - do_pauvre_nanoQC $bc fail $expName ${expPath}/qc -fi - - -################################################ -# aligning to genome -################################################ - -echo "aligning to genome..." - -mkdir -p ${expPath}/bamFiles - - #map reads to genome with minimap2 -# filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem] -minimap2 -ax map-ont $genomeFile ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T ${expName}_pass_${bc} -o ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam -minimap2 -ax map-ont $genomeFile ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T ${expName}_fail_${bc} -o ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam - -echo "index bam file ..." -samtools index ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam -samtools index ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam +source ${HOME}/.bashrc +source ${CONDA_ACTIVATE} nanopore + + +################################################# +## Collecting reads from barcodes that were used +################################################# +#echo "collecting reads from folder of barcodes that were used..." +# +## merge all reads from particular barcode into single file (pass fail separately) +#echo ${dataDir}/fast5Files +#mkdir -p ${workDir}/qc/NanoStat +# +#if [ -d "${workDir}/bcFastq/pass/${bc}" ]; +#then +# echo "passed reads..." +# cat ${workDir}/bcFastq/pass/${bc}/*.fastq > ${workDir}/bcFastq/${expName}_pass_${bc}.fastq +# gzip ${workDir}/bcFastq/${expName}_pass_${bc}.fastq +# rm ${workDir}/bcFastq/${expName}_pass_${bc}.fastq +# nanopolish index -s ${summaryFile} -d ${dataDir}/fast5Files ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz +# mkdir -p ${workDir}/qc/pass_${bc} +# nanoQC ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -o ${workDir}/qc/pass_${bc} +# NanoStat --fastq ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz --outdir ${workDir}/qc/NanoStat --name NanoStat_${expName}_pass_${bc}.txt --readtype 1D +#fi +# +#if [ -d "${workDir}/bcFastq/fail/${bc}" ]; +#then +# echo "failed reads ..." +# cat ${workDir}/bcFastq/fail/${bc}/*.fastq > ${workDir}/bcFastq/${expName}_fail_${bc}.fastq +# gzip ${workDir}/bcFastq/${expName}_fail_${bc}.fastq +# rm ${workDir}/bcFastq/${expName}_fail_${bc}.fastq +# nanopolish index -s ${summaryFile} -d ${dataDir}/fast5Files ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz +# mkdir -p ${workDir}/qc/fail_${bc} +# nanoQC ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -o ${workDir}/qc/fail_${bc} +# NanoStat --fastq ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz --outdir ${workDir}/qc/NanoStat --name NanoStat_${expName}_fail_${bc}.txt --readtype 1D +#fi +# +# +################################################# +## aligning to genome +################################################# +# +#echo "aligning to genome..." +# +#mkdir -p ${workDir}/bamFiles +# +## map reads to genome with minimap2 +## filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem] +#minimap2 -ax map-ont $genomeFile ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T pass_${bc} -o ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam +#minimap2 -ax map-ont $genomeFile ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T fail_${bc} -o ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam +# +#echo "index bam file ..." +#samtools index ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam +#samtools index ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam +# ################################################ @@ -111,76 +92,101 @@ chr=( `cut -f1 chrom.sizes` ) # identifying CmG ################################################ echo "identify CmG ..." +doFail=FALSE -mkdir -p ${expPath}/meth_calls/ +mkdir -p ${workDir}/meth_calls/ for i in "${!chr[@]}" do - ${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv - - ${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv + nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv + if [ "$doFail" = "TRUE" ] + then + echo "analysing failed reads" + nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv + fi done #### combine separate chromosomes into single file #### # first write header -head -1 ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv -head -1 ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv +head -1 ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv +if [ "$doFail" = TRUE ] +then + head -1 ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv +fi + # then combine files for i in "${!chr[@]}" do - tail -n +2 ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv - tail -n +2 ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv - rm ${expPath}/meth_calls/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv + tail -n +2 ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv + if [ "$doFail" = TRUE ] + then + tail -n +2 ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv + fi + rm ${workDir}/meth_calls/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv done #### caclulating frequency ###### -mkdir -p ${expPath}/meth_freq -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv > ${expPath}/meth_freq/${expName}_pass_${bc}_freqCmG.tsv - -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv > ${expPath}/meth_freq/${expName}_fail_${bc}_freqCmG.tsv +mkdir -p ${workDir}/meth_freq +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_pass_${bc}_CpGcalls.tsv > ${workDir}/meth_freq/${expName}_pass_${bc}_freqCmG.tsv +if [ "$doFail" = TRUE ] +then + ${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_fail_${bc}_CpGcalls.tsv > ${workDir}/meth_freq/${expName}_fail_${bc}_freqCmG.tsv +fi ################################################ # identifying GCm ################################################ echo "identify GCm ..." -mkdir -p ${expPath}/meth_calls/ +mkdir -p ${workDir}/meth_calls/ for i in "${!chr[@]}" do -echo ${chr[$i]} -echo ${chrIntervals[$i]} - ${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -r ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv - - ${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -r ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${expPath}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv + echo ${chr[$i]} + echo ${chrIntervals[$i]} + nanopolish call-methylation -t 4 -q gpc -r ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_pass_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv + + if [ "${doFail}" = TRUE ] + then + nanopolish call-methylation -t 4 -q gpc -r ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${workDir}/bamFiles/${expName}_fail_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv + fi done #### combine separate chromosomes into single file #### # first write header -head -1 ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv -head -1 ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv +head -1 ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv +if [ "$doFail" = TRUE ] +then + head -1 ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv +fi # then combine files for i in "${!chr[@]}" do - tail -n +2 ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv - tail -n +2 ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv - rm ${expPath}/meth_calls/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv + tail -n +2 ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv + if [ "$doFail" = TRUE ] + then + tail -n +2 ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv + fi + rm ${workDir}/meth_calls/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv done #### caclulating frequency ###### -mkdir -p ${expPath}/meth_freq +mkdir -p ${workDir}/meth_freq -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv > ${expPath}/meth_freq/${expName}_pass_${bc}_freqGCm.tsv +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_pass_${bc}_GpCcalls.tsv > ${workDir}/meth_freq/${expName}_pass_${bc}_freqGCm.tsv -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv > ${expPath}/meth_freq/${expName}_fail_${bc}_freqGCm.tsv +if [ "$doFail" = TRUE ] +then + ${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${expName}_fail_${bc}_GpCcalls.tsv > ${workDir}/meth_freq/${expName}_fail_${bc}_freqGCm.tsv +fi diff --git a/alignFastq_spikeIn.sh b/alignFastq_spikeIn.sh index a86a09d..a393560 100755 --- a/alignFastq_spikeIn.sh +++ b/alignFastq_spikeIn.sh @@ -9,10 +9,6 @@ genomeFile=$3 # full path to reference genome genomeName=`basename $genomeFile` genomeName=${genomeName%.fasta} -# need absolute paths for nanopolish index. get it from the summary file. -#summaryFile=`readlink -f ../fastqFiles/sequencing_summary.txt` -#expPath=`dirname $summaryFile` -expPath=`readlink -f ../` ####### modules to load ########## module load vital-it @@ -24,22 +20,24 @@ module add UHTS/Analysis/samtools/1.8; -################################################ -# aligning to genome -################################################ - -echo "aligning to genome..." - -mkdir -p ../bamFiles/${genomeName} -# map reads to genome with minimap2 -# filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem] -minimap2 -ax map-ont $genomeFile ../bcFastq/${expName}_pass_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T ${genomeName}_${expName}_pass_${bc} -o ../bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam -minimap2 -ax map-ont $genomeFile ../bcFastq/${expName}_fail_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T ${genomeName}_${expName}_pass_${bc} -o ../bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam +################################################# +## aligning to genome +################################################# +# +#echo "aligning to genome..." +# +#mkdir -p ../bamFiles/${genomeName} +# +## map reads to genome with minimap2 +## filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem] +#minimap2 -ax map-ont $genomeFile ../bcFastq/${expName}_pass_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T tmp -o ../bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam +#minimap2 -ax map-ont $genomeFile ../bcFastq/${expName}_fail_${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T tmp -o ../bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam +# +#echo "index bam file ..." +#samtools index ../bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam +#samtools index ../bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam -echo "index bam file ..." -samtools index ../bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam -samtools index ../bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam ################################################ @@ -63,32 +61,32 @@ echo "identify CmG ..." mkdir -p ../meth_calls/${genomeName} for i in "${!chr[@]}" do -${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${expPath}/bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv +${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${workDir}/bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv -${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${expPath}/bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv +#${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${workDir}/bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv done #### combine separate chromosomes into single file #### # first write header -head -1 ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv -head -1 ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv +head -1 ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv +#head -1 ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv # then combine files for i in "${!chr[@]}" do - tail -n +2 ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv - tail -n +2 ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv - rm ${expPath}/meth_calls/${genomeName}/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv + tail -n +2 ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv +# tail -n +2 ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv + rm ${workDir}/meth_calls/${genomeName}/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv done #### caclulating frequency ###### mkdir -p ../meth_freq/${genomeName} -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv > ${expPath}/meth_freq/${genomeName}/${expName}_pass_${bc}_freqCmG.tsv +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv > ${workDir}/meth_freq/${genomeName}/${expName}_pass_${bc}_freqCmG.tsv -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv > ${expPath}/meth_freq/${genomeName}/${expName}_fail_${bc}_freqCmG.tsv +#${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv > ${workDir}/meth_freq/${genomeName}/${expName}_fail_${bc}_freqCmG.tsv @@ -100,30 +98,30 @@ echo "identify GCm ..." mkdir -p ../meth_calls/${genomeName} for i in "${!chr[@]}" do -${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${expPath}/bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv +${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_pass_${bc}.fastq.gz -b ${workDir}/bamFiles/${genomeName}/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv -${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -w ${chrIntervals[$i]} -r ${expPath}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${expPath}/bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv +#${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -w ${chrIntervals[$i]} -r ${workDir}/bcFastq/${expName}_fail_${bc}.fastq.gz -b ${workDir}/bamFiles/${genomeName}/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv done #### combine separate chromosomes into single file #### # first write header -head -1 ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv -head -1 ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv +head -1 ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv +#head -1 ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv # then combine files for i in "${!chr[@]}" do - tail -n +2 ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv - tail -n +2 ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv - rm ${expPath}/meth_calls/${genomeName}/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv + tail -n +2 ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv +# tail -n +2 ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv + rm ${workDir}/meth_calls/${genomeName}/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv done #### caclulating frequency ###### mkdir -p ../meth_freq/${genomeName} -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv > ${expPath}/meth_freq/${genomeName}/${expName}_pass_${bc}_freqGCm.tsv +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv > ${workDir}/meth_freq/${genomeName}/${expName}_pass_${bc}_freqGCm.tsv -${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${expPath}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv > ${expPath}/meth_freq/${genomeName}/${expName}_fail_${bc}_freqGCm.tsv +#${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_calls/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv > ${workDir}/meth_freq/${genomeName}/${expName}_fail_${bc}_freqGCm.tsv diff --git a/alignFastqdb.sh b/alignFastqdb.sh new file mode 100644 index 0000000..1ece788 --- /dev/null +++ b/alignFastqdb.sh @@ -0,0 +1,183 @@ +#! /bin/bash +## script to arrange all fastq files into batches for analysis. Combined files will be created for +## each barcode of interest for both passed and failed reads (e.g. pass_barcode01 or fail_barcode01). + +source ./varSettings.sh +# Get variables from command line +expName=$1 # experiment name (date of exp usually) +bc=$2 # barcode +genomeFile=$3 # full path to reference genome + +# need absolute paths for nanopolish index. get it from the summary file. +summaryFile=${workDir}/fastqFiles/sequencing_summary.txt + +####### modules to load ########## +module load vital-it +#module load R/3.5.1 +module add UHTS/Analysis/minimap2/2.12; +module add UHTS/Analysis/samtools/1.8; +############################################ + +source ${HOME}/.bashrc +source ${CONDA_ACTIVATE} nanopore + + +################################################# +## Collecting reads from barcodes that were used +################################################# +#echo "collecting reads from folder of barcodes that were used..." +# +## merge all reads from particular barcode into single file (pass fail separately) +#echo ${dataDir}/fast5Files +#mkdir -p ${workDir}/qc/NanoStatdb +# +#if [ -f ${workDir}/dbbcFastq/pass/${bc}.fastq.gz ]; +#then +# echo "passed reads..." +# nanopolish index -s ${summaryFile} -d ${dataDir}/fast5Files ${workDir}/dbbcFastq/pass/${bc}.fastq.gz +# mkdir -p ${workDir}/qc/db_pass_${bc} +# nanoQC ${workDir}/dbbcFastq/pass/${bc}.fastq.gz -o ${workDir}/qc/db_pass_${bc} +# NanoStat --fastq ${workDir}/dbbcFastq/pass/${bc}.fastq.gz --outdir ${workDir}/qc/NanoStatdb --name NanoStat_${expName}_pass_${bc}.txt --readtype 1D +#fi +# +#if [ -f ${workDir}/dbbcFastq/fail/${bc}.fastq.gz ]; +#then +# echo "failed reads ..." +# nanopolish index -s ${summaryFile} -d ${dataDir}/fast5Files ${workDir}/dbbcFastq/fail/${bc}.fastq.gz +# mkdir -p ${workDir}/qc/db_fail_${bc} +# nanoQC ${workDir}/dbbcFastq/fail/${bc}.fastq.gz -o ${workDir}/qc/db_fail_${bc} +# NanoStat --fastq ${workDir}/dbbcFastq/fail/${bc}.fastq.gz --outdir ${workDir}/qc/NanoStatdb --name NanoStat_${expName}_fail_${bc}.txt --readtype 1D +#fi +# +# +################################################# +## aligning to genome +################################################# +# +#echo "aligning to genome..." +# +#mkdir -p ${workDir}/bamFilesdb +# +## map reads to genome with minimap2 +## filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem] +#minimap2 -ax map-ont $genomeFile ${workDir}/dbbcFastq/pass/${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T pass_${bc} -o ${workDir}/bamFilesdb/${expName}_pass_${bc}.sorted.bam +#minimap2 -ax map-ont $genomeFile ${workDir}/dbbcFastq/fail/${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T fail_${bc} -o ${workDir}/bamFilesdb/${expName}_fail_${bc}.sorted.bam +# +#echo "index bam file ..." +#samtools index ${workDir}/bamFilesdb/${expName}_pass_${bc}.sorted.bam +#samtools index ${workDir}/bamFilesdb/${expName}_fail_${bc}.sorted.bam + + +################################################ +# getting chr intervals +################################################ +if [ ! -f "chrom.sizes" ] +then + faidx $genomeFile -i chromsizes > chrom.sizes +fi + +chrIntervals=( `cut -f1,2 chrom.sizes | sed $'s/\t/:1-/g'` ) +chr=( `cut -f1 chrom.sizes` ) + + +################################################ +# identifying CmG +################################################ +echo "identify CmG ..." +doFail=FALSE + +mkdir -p ${workDir}/meth_callsdb/ + +for i in "${!chr[@]}" +do + nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/dbbcFastq/pass/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_callsdb/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv + if [ "${doFail}" = "TRUE" ] + then + echo "processing failed reads" + nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/dbbcFastq/fail/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_callsdb/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv + fi +done + + +#### combine separate chromosomes into single file #### + +# first write header +head -1 ${workDir}/meth_callsdb/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${expName}_pass_${bc}_CpGcalls.tsv +if [ "${doFail}" = "TRUE" ] +then + head -1 ${workDir}/meth_callsdb/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${expName}_fail_${bc}_CpGcalls.tsv +fi + + +# then combine files +for i in "${!chr[@]}" +do + tail -n +2 ${workDir}/meth_callsdb/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${expName}_pass_${bc}_CpGcalls.tsv + if [ "${doFail}" = "TRUE" ] + then + tail -n +2 ${workDir}/meth_callsdb/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${expName}_fail_${bc}_CpGcalls.tsv + fi + rm ${workDir}/meth_callsdb/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv +done + + +#### caclulating frequency ###### +mkdir -p ${workDir}/meth_freqdb +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${expName}_pass_${bc}_CpGcalls.tsv > ${workDir}/meth_freqdb/${expName}_pass_${bc}_freqCmG.tsv + +if [ "${doFail}" = "TRUE" ] +then + ${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${expName}_fail_${bc}_CpGcalls.tsv > ${workDir}/meth_freqdb/${expName}_fail_${bc}_freqCmG.tsv +fi + +################################################ +# identifying GCm +################################################ +echo "identify GCm ..." + +mkdir -p ${workDir}/meth_callsdb/ + +for i in "${!chr[@]}" +do + echo ${chr[$i]} + echo ${chrIntervals[$i]} + nanopolish call-methylation -t 4 -q gpc -r ${workDir}/dbbcFastq/pass/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${expName}_pass_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${workDir}/meth_callsdb/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv + + if [ "${doFail}" = "TRUE" ] + then + nanopolish call-methylation -t 4 -q gpc -r ${workDir}/dbbcFastq/fail/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${expName}_fail_${bc}.sorted.bam -g $genomeFile -w ${chrIntervals[$i]} > ${workDir}/meth_callsdb/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv + fi +done + + +#### combine separate chromosomes into single file #### + +# first write header +head -1 ${workDir}/meth_callsdb/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${expName}_pass_${bc}_GpCcalls.tsv +if [ "${doFail}" = "TRUE" ] +then + head -1 ${workDir}/meth_callsdb/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${expName}_fail_${bc}_GpCcalls.tsv +fi + +# then combine files +for i in "${!chr[@]}" +do + tail -n +2 ${workDir}/meth_callsdb/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${expName}_pass_${bc}_GpCcalls.tsv + if [ "${doFail}" = "TRUE" ] + then + tail -n +2 ${workDir}/meth_callsdb/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${expName}_fail_${bc}_GpCcalls.tsv + fi + rm ${workDir}/meth_callsdb/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv +done + + +#### caclulating frequency ###### +mkdir -p ${workDir}/meth_freqdb + +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${expName}_pass_${bc}_GpCcalls.tsv > ${workDir}/meth_freqdb/${expName}_pass_${bc}_freqGCm.tsv + +if [ "${doFail}" = "TRUE" ] +then + ${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${expName}_fail_${bc}_GpCcalls.tsv > ${workDir}/meth_freqdb/${expName}_fail_${bc}_freqGCm.tsv +fi + diff --git a/alignFastqdb_spikeIn.sh b/alignFastqdb_spikeIn.sh new file mode 100644 index 0000000..5221b3d --- /dev/null +++ b/alignFastqdb_spikeIn.sh @@ -0,0 +1,126 @@ +#! /bin/bash +## script to arrange all fastq files into batches for analysis. Combined files will be created for +## each barcode of interest for both passed and failed reads (e.g. pass_barcode01 or fail_barcode01). + +# Get variables from command line +expName=$1 # experiment name (date of exp usually) +bc=$2 # barcode +genomeFile=$3 # full path to reference genome +genomeName=`basename $genomeFile` +genomeName=${genomeName%.fasta} + + +####### modules to load ########## +module load vital-it +#module load R/3.5.1 +module add UHTS/Analysis/minimap2/2.12; +module add UHTS/Analysis/samtools/1.8; +############################################ + + + + +################################################# +## aligning to genome +################################################# +# +#echo "aligning to genome..." +# +#mkdir -p ../bamFilesdb/${genomeName} +# +## map reads to genome with minimap2 +## filter reads with flag=2308: unmapped (4) + secondary alignment (256) + supplementary alignment (2048) [the latter category is the main problem] +#minimap2 -ax map-ont $genomeFile ../dbbcFastq/pass/${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T tmp -o ../bamFilesdb/${genomeName}/${expName}_pass_${bc}.sorted.bam +#minimap2 -ax map-ont $genomeFile ../dbbcFastq/fail/${bc}.fastq.gz | samtools view -F 2308 -b | samtools sort -T tmp -o ../bamFilesdb/${genomeName}/${expName}_fail_${bc}.sorted.bam +# +#echo "index bam file ..." +#samtools index ../bamFilesdb/${genomeName}/${expName}_pass_${bc}.sorted.bam +#samtools index ../bamFilesdb/${genomeName}/${expName}_fail_${bc}.sorted.bam + + +################################################ +# getting chr intervals +################################################ +if [ ! -f "${genomeName}.chrom.sizes" ] +then + faidx $genomeFile -i chromsizes > ${genomeName}.chrom.sizes +fi + +chrIntervals=( `cut -f1,2 ${genomeName}.chrom.sizes | sed $'s/\t/:1-/g'` ) +chr=( `cut -f1 ${genomeName}.chrom.sizes` ) + + + +################################################ +# identifying CmG +################################################ +echo "identify CmG ..." + +mkdir -p ../meth_callsdb/${genomeName} +for i in "${!chr[@]}" +do +${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/dbbcFastq/pass/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${genomeName}/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv + +#${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q cpg -w ${chrIntervals[$i]} -r ${workDir}/dbbcFastq/fail/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${genomeName}/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv +done + + +#### combine separate chromosomes into single file #### + +# first write header +head -1 ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv +#head -1 ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv + +# then combine files +for i in "${!chr[@]}" +do + tail -n +2 ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv + # tail -n +2 ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_CpGcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv + rm ${workDir}/meth_callsdb/${genomeName}/${expName}_????_${bc}_CpGcalls_${chr[$i]}.tsv +done + + +#### caclulating frequency ###### +mkdir -p ../meth_freq/${genomeName} +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_CpGcalls.tsv > ${workDir}/meth_freqdb/${genomeName}/${expName}_pass_${bc}_freqCmG.tsv + +#${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_CpGcalls.tsv > ${workDir}/meth_freqdb/${genomeName}/${expName}_fail_${bc}_freqCmG.tsv + + + +################################################ +# identifying GCm +################################################ +echo "identify GCm ..." + +mkdir -p ../meth_callsdb/${genomeName} +for i in "${!chr[@]}" +do +${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -w ${chrIntervals[$i]} -r ${workDir}/dbbcFastq/pass/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${genomeName}/${expName}_pass_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv + +#${NANOPOLISH_DIR}/nanopolish call-methylation -t 4 -q gpc -w ${chrIntervals[$i]} -r ${workDir}/dbbcFastq/fail/${bc}.fastq.gz -b ${workDir}/bamFilesdb/${genomeName}/${expName}_fail_${bc}.sorted.bam -g $genomeFile > ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv +done + + +#### combine separate chromosomes into single file #### + +# first write header +head -1 ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv +#head -1 ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[0]}.tsv > ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv + +# then combine files +for i in "${!chr[@]}" +do + tail -n +2 ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv + # tail -n +2 ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_GpCcalls_${chr[$i]}.tsv >> ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv + rm ${workDir}/meth_callsdb/${genomeName}/${expName}_????_${bc}_GpCcalls_${chr[$i]}.tsv +done + + + +#### caclulating frequency ###### +mkdir -p ../meth_freqdb/${genomeName} +${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${genomeName}/${expName}_pass_${bc}_GpCcalls.tsv > ${workDir}/meth_freqdb/${genomeName}/${expName}_pass_${bc}_freqGCm.tsv + +#${NANOPOLISH_DIR}/scripts/calculate_methylation_frequency.py -i ${workDir}/meth_callsdb/${genomeName}/${expName}_fail_${bc}_GpCcalls.tsv > ${workDir}/meth_freqdb/${genomeName}/${expName}_fail_${bc}_freqGCm.tsv + diff --git a/basecallAlbacore.sh b/basecallAlbacore.sh deleted file mode 100755 index 203c98f..0000000 --- a/basecallAlbacore.sh +++ /dev/null @@ -1,33 +0,0 @@ -#! /bin/bash -## script to basecall all fast5 in the folder one level up called fast5files and its recursive directories with albacore -## uses barcoding -## outputs 200,000 reads per fastq file (-q option) -## does not use batch subdirectories (-n 0 option) -## output will be in e.g: ../workspace/pass/barcode07/firstfastq.fastq - - -source activate albacore_env - -read_fast5_basecaller.py --flowcell FLO-MIN106 --kit SQK-LSK108 --barcoding --output_format fastq -n 0 -q 200000 --input ../fast5files/ --recursive --save_path ../ --worker_threads 8 - -################## -# run MinIONQC -################# - -module load vital-it -module load R/3.5.1 - -# create qc output directory -qcDir=../fastqQC -mkdir -p $qcDir - -echo "doing MinIONQC..." -# run Minion_qc -# source: https://github.com/roblanf/minion_qc - -source ./varSettings.sh -mkdir -p ${qcDir}/MinIONQC - -Rscript ${MINIONQC} -i ../sequencing_summary.txt -o $qcDir - -mv ../*.png ${qcDir}/MinIONQC/ diff --git a/basecallGuppy.sh b/basecallGuppy.sh index 21eb720..7a53aee 100755 --- a/basecallGuppy.sh +++ b/basecallGuppy.sh @@ -1,43 +1,83 @@ #! /bin/bash -## script to basecall all fast5 in the folder one level up called fast5files and its recursive directories with Guppy +## script to basecall all fast5 in the folder called fast5files and its recursive directories with Guppy ## .bashrc should include location of bin directory inside Guppy directory and export it as ${GUPPYDIR} ## uses barcoding -## outputs 200,000 reads per fastq file (-q option) -## output will be in e.g: ../fastqFiles/pass/firstfastq.fastq -## next the files are sorted by barcode and output to e.g ../bcFastq/pass/barcodeXX/firstfastq.fastq -## pycoQC is run and results are in ../qc/pycoQC/pycoQC.html +## output will be in e.g: ./fastqFiles/pass/firstfastq.fastq +## next the files are sorted by barcode and output to e.g ./bcFastq/pass/barcodeXX.fastq.gz +## pycoQC is run and results are in ./qc/pycoQC/pycoQC.html source ./varSettings.sh -fullPath=`readlink -f ../` -${GUPPYDIR}/guppy_basecaller --input_path ${fullPath}/fast5Files --save_path ${fullPath}/fastqFiles --flowcell FLO-MIN106 --kit SQK-LSK109 --records_per_fastq 200000 --recursive --cpu_threads_per_caller 8 --qscore_filtering --min_qscore 3 -#--num_callers -#basecall passed files -mkdir -p ../bcFastq/pass -${GUPPYDIR}/guppy_barcoder -i ${fullPath}/fastqFiles/pass -s ${fullPath}/bcFastq/pass --barcode_kits EXP-NBD104 --worker_threads 8 --recursive -q 200000 +################## +# basecall +################# + + +#guppy_basecaller --input_path ${dataDir}/fast5Files --save_path ${workDir}/fastqFiles --flowcell FLO-MIN106 --kit SQK-LSK109 --records_per_fastq 200000 --recursive --qscore_filtering --min_qscore 3 --device auto -#basecall failed files -mkdir -p ../bcFastq/fail -${GUPPYDIR}/guppy_barcoder -i ${fullPath}/fastqFiles/fail -s ${fullPath}/bcFastq/fail --barcode_kits EXP-NBD104 --worker_threads 8 --recursive -q 200000 -#rm ../fastqFiles +################## +# call barcodes +################# + +deepbinner classify --native ${dataDir}/fast5Files > ${workDir}/classifications +################## +# bin by barcode +################## + +mkdir -p ${workDir}/dbbcFastq/pass +mkdir -p ${workDir}/dbbcFastq/fail + +cat ${workDir}/fastqFiles/pass/* > ${workDir}/fastqFiles/pass/passed.fq +cat ${workDir}/fastqFiles/fail/* > ${workDir}/fastqFiles/fail/failed.fq + +deepbinner classify --native ${dataDir}/fast5Files > ${workDir}/classifications + + +deepbinner bin --classes ${workDir}/classifications --reads ${workDir}/fastqFiles/pass/passed.fq --out_dir ${workDir}/dbbcFastq/pass +deepbinner bin --classes ${workDir}/classifications --reads ${workDir}/fastqFiles/fail/failed.fq --out_dir ${workDir}/dbbcFastq/fail + +#rm ${workDir}/fastqFiles/pass/passed.fq +#rm ${workDir}/fastqFiles/fail/failed.fq + + + +################### +## call barcodes +################## + +#mkdir -p ${workDir}/bcFastq/pass +#guppy_barcoder -i ${workDir}/fastqFiles/pass -s ${workDir}/bcFastq/pass --barcode_kits EXP-NBD104 --device auto --recursive + + +#mkdir -p ${workDir}/bcFastq/fail +#guppy_barcoder -i ${workDir}/fastqFiles/fail -s ${workDir}/bcFastq/fail --barcode_kits EXP-NBD104 --device auto --recursive + ################## # run pycoQC ################# -source activate pycoQC +#source ${HOME}/.bashrc +#source ${CONDA_ACTIVATE} +#conda activate pycoQC # create qc output directory -qcDir=../qc -mkdir -p $qcDir/pycoQC +qcDir=${workDir}/qc +mkdir -p ${qcDir}/pycoQC #https://github.com/a-slide/pycoQC -pycoQC -f ${fullPath}/fastqFiles/sequencing_summary.txt -b ${fullPath}/bcFastq/pass/barcoding_summary.txt -o ${qcDir}/pycoQC/pycoQC_${expName}_pass.html --title "nanoDSMF "${expName}" passed reads" --min_pass_qual 3 +#pycoQC --summary_file ${workDir}/fastqFiles/sequencing_summary.txt --barcode_file ${workDir}/bcFastq/pass/barcoding_summary.txt -o ${qcDir}/pycoQC/pycoQC_${expName}_pass.html --min_pass_qual 3 + + +#pycoQC --summary_file ${workDir}/fastqFiles/sequencing_summary.txt --barcode_file ${workDir}/bcFastq/fail/barcoding_summary.txt -o ${qcDir}/pycoQC/pycoQC_${expName}_fail.html + +#conda deactivate + +#pycoQC --summary_file ${workDir}/fastqFiles/sequencing_summary.txt --barcode_file ${workDir}/bcFastq/pass/barcoding_summary.txt -o ${qcDir}/pycoQC/pycoQC_${expName}_pass.html --min_pass_qual 3 -pycoQC -f ${fullPath}/fastqFiles/sequencing_summary.txt -b ${fullPath}/bcFastq/fail/barcoding_summary.txt -o ${qcDir}/pycoQC/pycoQC_${expName}_fail.html --title "nanoDSMF "${expName}" failed reads" --min_pass_qual 3 -source deactivate +pycoQC --summary_file ${workDir}/fastqFiles/sequencing_summary.txt -o ${qcDir}/pycoQC/pycoQC_${expName}.html diff --git a/varSettings_example.sh b/varSettings_example.sh index a366708..ff18e6f 100644 --- a/varSettings_example.sh +++ b/varSettings_example.sh @@ -1,16 +1,29 @@ #! /bin/bash # name of experiment -expName="20171027" +expName="20190411" # barcodes used in this experiment -barcodesOfInterest=( barcode05 barcode06 barcode07 barcode08 ) +barcodesOfInterest=( barcode01 barcode02 barcode03 barcode04 barcode05 unclassified ) +# barcodes with non C.elegans DNA spike in +bcWithSpikeIn=( barcode01 barcode05 unclassified ) + # location of reference genome file -genomeFile=/data/projects/p025/Jenny/genomeVer/PCR_wPM28_32/PCR_wPM28_32.fasta +#genomeFile=/data/projects/p025/Jenny/genomeVer/PCR_wPM28_32/PCR_wPM28_32.fasta +genomeFile=/mnt/imaging.data/jsemple/genomeVer/ws265/c_elegans.PRJNA13758.WS265.genomic.fa + +# location of reference genomes for spiked-in DNA +#lambdaFile="/home/ubelix/izb/semple/genomeVer/lambda/lambdaPhage.fasta" +#phiXfile="home/ubelix/izb/semple/genomeVer/phiX/phiX.fasta" +lambdaFile=/mnt/imaging.data/jsemple/genomeVer/lambda/lambdaPhage.fasta +phiXfile=/mnt/imaging.data/jsemple/genomeVer/phiX/phiX.fasta + +dataDir=/mnt/imaging.data/jsemple/20190411_dSMFv021-025np_N2gw +workDir=/home/jsemple/20190411_dSMFv021-025np_N2gw # location of MinION program -MINIONQC=/home/pmeister/software/MinIONQC.R +#MINIONQC=/home/pmeister/software/MinIONQC.R ########## don't edit below this line ############### # note: the .bashrc should point to the location of nanopolish with the variable