forked from HaploKit/Strainline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreproduce.sh
107 lines (76 loc) · 4.49 KB
/
reproduce.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/bash
# Runs Strainline on the small example dataset. The larger dataset runs
# further down in this file are disabled by default.
#
# -e: abort on the first failing command
# -u: treat expansion of an unset variable as an error
# -o pipefail: a pipeline fails if any of its stages fails
set -euo pipefail
######### Run strainline #########
# Directory that contains strainline.sh.
srcpath=/root/capsule/code/src
# Thread count handed to every strainline.sh invocation via -t.
threads=16
# small example dataset
echo "Running the example dataset..."
input=/root/capsule/data/example/reads.fa
# Expansions are quoted so the values are never word-split or glob-expanded.
"$srcpath/strainline.sh" -i "$input" -o out -p pb -k 20 -t "$threads"
## Note: Running the following code on 'Code Ocean' may be time-consuming because of limited computational resources ##
## For all datasets, we provide the final output haplotypes in 'result/' ##
# Full set of simulated and real dataset runs. Disabled by default: the
# condition is the constant `false`, so nothing in this block executes unless
# it is changed to `true`.
# NOTE(review): every run below writes to the same output directory `out`;
# presumably the runs are meant to be enabled one at a time -- confirm before
# turning on the whole block.
if false; then
  # simulated data, pacbio clr, 5-HIV
  input=/root/capsule/data/simulated/clr/5-HIV/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p pb -k 50 --maxGD 0.02 --maxLD 0.01 --maxOH 50 -t "$threads"

  ## For other different sequencing coverage data (500x,1000x,2000x,5000x,10000x)
  ## Loop over the per-coverage read files so each one is passed to -i on its own.
  # for input in /root/capsule/data/simulated/clr/5-HIV/diff_coverage/*/reads.fa.gz; do
  #   "$srcpath/strainline.sh" -i "$input" -o out -p pb -k 50 --maxGD 0.01 --maxLD 0.01 --maxOH 50 -t "$threads"
  # done

  # simulated data, pacbio clr, 6-polio
  input=/root/capsule/data/simulated/clr/6-polio/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p pb -k 100 --minIdentity 0.995 -t "$threads"

  # simulated data, pacbio clr, 10-HCV
  input=/root/capsule/data/simulated/clr/10-HCV/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p pb -k 100 --maxGD 0.02 --maxOH 50 -t "$threads"

  # simulated data, pacbio clr, 15-ZIKV
  input=/root/capsule/data/simulated/clr/15-ZIKV/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p pb -k 500 --minIdentity 0.995 --maxGD 0.005 --minAbun 0.01 -t "$threads"

  # simulated data, pacbio clr, 5-SARSCov2
  input=/root/capsule/data/simulated/clr/5-SARSCov2/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p pb -k 100 --maxLD 0.01 --minAbun 0.085 --iter 3 -t "$threads"

  # simulated data, ont, 5-HIV
  input=/root/capsule/data/simulated/ont/5-HIV/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 20 --maxGD 0.02 --maxLD 0.01 --maxOH 50 -t "$threads"

  # simulated data, ont, 6-polio
  input=/root/capsule/data/simulated/ont/6-polio/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 100 --maxOH 50 --maxLD 0.01 -t "$threads"

  # simulated data, ont, 10-HCV
  input=/root/capsule/data/simulated/ont/10-HCV/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 100 --maxGD 0.02 --maxLD 0.01 --maxOH 50 -t "$threads"

  # simulated data, ont, 15-ZIKV
  input=/root/capsule/data/simulated/ont/15-ZIKV/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 100 --minIdentity 0.995 --minAbun 0.01 -t "$threads"

  # simulated data, ont, 5-SARSCov2
  input=/root/capsule/data/simulated/ont/5-SARSCov2/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 100 --maxLD 0.01 --minAbun 0.085 -t "$threads"

  # real data, ont, 5-PVY
  input=/root/capsule/data/real/5-PVY/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 100 --maxGD 0.02 --maxLD 0.01 --minOvlpLen 400 --minSeedLen 2000 --minAbun 0.03 --maxOH 20 -t "$threads"

  # real data, ont, SARSCov2 [SRA:SRP250446]
  input=/root/capsule/data/real/SARSCov2/SRP250446/reads.fa.gz
  "$srcpath/strainline.sh" -i "$input" -o out -p ont -k 50 --rmMisassembly True --minAbun 0.05 --minIdentity 0.98 --maxLD 0.01 -t "$threads"
fi
######### Run other tools #########
## Run canu (canu --version: "Canu snapshot ()"), installed on Nov 4, 2019 via conda ##
## use parameters recommended for metagenome assembly ##
## For PacBio CLR
# canu -p out -d out genomeSize=GENOMESIZE minReadLength=500 minOverlapLength=200 corOutCoverage=10000 corMhapSensitivity=high corMinCoverage=0 redMemory=32 oeaMemory=32 batMemory=200 -pacbio-raw reads.fa
## For ONT
# canu -p out -d out genomeSize=GENOMESIZE minReadLength=500 minOverlapLength=200 corOutCoverage=10000 corMhapSensitivity=high corMinCoverage=0 redMemory=32 oeaMemory=32 batMemory=200 -nanopore-raw reads.fa
## Run wtdbg2 Version: 0.0 (19830203) ##
# wtdbg2 -x rs -g GENOMESIZE -t 8 -i $fq -fo out
# wtpoa-cns -t 8 -i out.ctg.lay.gz -fo out.ctg.fa
# Print a heading, list the given paths with ls, then print an empty line.
#   $1   - heading text, printed verbatim followed by a newline
#   $2.. - paths handed to ls; callers pass an unquoted glob so the shell
#          expands it before the call
show_assemblies() {
  printf '%s\n' "$1"
  shift
  ls "$@"
  printf '\n'
}

show_assemblies 'The output assemblies of simulated datasets are located here:' \
  /root/capsule/data/uploadedResults/simulated/*/*/haplotypes.*.fa
show_assemblies 'the output assemblies of real datasets are located here:' \
  /root/capsule/data/uploadedResults/real/*/haplotypes.*.fa
show_assemblies 'The output assemblies of simulated datasets[5-HIV, various sequencing coverages] are located here:' \
  /root/capsule/data/uploadedResults/simulated/*/5-HIV/*/haplotypes.*.fa
######### Evaluation #########
## For assembly evaluation, we used the following commands ##
# quast-5.1.0rc1/metaquast.py -r ref.fa --min-contig 500 -o out --unique-mapping -t 16 contigs.fa
# cat out/runs_per_reference/ref/report.tsv