-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathshortReadsqc.wdl
317 lines (275 loc) · 8.54 KB
/
shortReadsqc.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# Short reads QC workflow
version 1.0
workflow ShortReadsQC {
input{
String container="bfoster1/img-omics:0.1.9"
String bbtools_container="microbiomedata/bbtools:38.96"
String workflow_container = "microbiomedata/workflowmeta:1.1.1"
String proj
String prefix=sub(proj, ":", "_")
Array[String] input_files
Array[String] input_fq1
Array[String] input_fq2
Boolean interleaved
String database="/refdata/"
Int rqc_mem = 180
}
if (interleaved) {
call stage_single {
input:
container = container,
input_file = input_files
}
}
if (!interleaved) {
call stage_interleave {
input:
input_fastq1 = input_fq1,
input_fastq2 = input_fq2,
container = bbtools_container,
memory = "10G"
}
}
# Estimate RQC runtime at an hour per compress GB
call rqcfilter as qc {
input:
input_fastq = if interleaved then stage_single.reads_fastq else stage_interleave.reads_fastq,
threads = "16",
database = database,
memory = rqc_mem,
container = bbtools_container
}
call make_info_file {
input:
info_file = qc.info_file,
container = container,
prefix = prefix
}
call finish_rqc {
input:
container = workflow_container,
prefix = prefix,
filtered = qc.filtered,
filtered_stats = qc.stat,
filtered_stats2 = qc.stat2
}
output {
File filtered_final = finish_rqc.filtered_final
File filtered_stats_final = finish_rqc.filtered_stats_final
File filtered_stats2_final = finish_rqc.filtered_stats2_final
File rqc_info = make_info_file.rqc_info
File stats = finish_rqc.json_out
}
}
task stage_single {
input{
String container
String target="raw.fastq.gz"
Array[String] input_file
}
command <<<
set -oeu pipefail
for file in ~{sep= ' ' input_file}; do
temp=$(basename $file)
if [ $( echo $file|egrep -c "https*:") -gt 0 ] ; then
wget $file -O $temp
else
ln -s $file $temp || cp $file $temp
fi
cat $temp >> ~{target}
done
# Capture the start time
date --iso-8601=seconds > start.txt
>>>
output{
File reads_fastq = "~{target}"
String start = read_string("start.txt")
}
runtime {
memory: "1 GiB"
cpu: 2
maxRetries: 1
docker: container
}
}
task stage_interleave {
input{
String container
String memory
String target_reads_1="raw_reads_1.fastq.gz"
String target_reads_2="raw_reads_2.fastq.gz"
String output_interleaved="raw.fastq.gz"
Array[String] input_fastq1
Array[String] input_fastq2
Int file_num = length(input_fastq1)
}
command <<<
set -oeu pipefail
# load wdl array to shell array
FQ1_ARRAY=(~{sep=" " input_fastq1})
FQ2_ARRAY=(~{sep=" " input_fastq2})
for (( i = 0; i < ~{file_num}; i++ )) ;do
fq1_name=$(basename ${FQ1_ARRAY[$i]})
fq2_name=$(basename ${FQ2_ARRAY[$i]})
if [ $( echo ${FQ1_ARRAY[$i]} | egrep -c "https*:") -gt 0 ] ; then
wget ${FQ1_ARRAY[$i]} -O $fq1_name
wget ${FQ2_ARRAY[$i]} -O $fq2_name
else
ln -s ${FQ1_ARRAY[$i]} $fq1_name || cp ${FQ1_ARRAY[$i]} $fq1_name
ln -s ${FQ2_ARRAY[$i]} $fq2_name || cp ${FQ2_ARRAY[$i]} $fq2_name
fi
cat $fq1_name >> ~{target_reads_1}
cat $fq2_name >> ~{target_reads_2}
done
reformat.sh -Xmx~{memory} in1=~{target_reads_1} in2=~{target_reads_2} out=~{output_interleaved}
# Validate that the read1 and read2 files are sorted correctly
reformat.sh -Xmx~{memory} verifypaired=t in=~{output_interleaved}
# Capture the start time
date --iso-8601=seconds > start.txt
>>>
output{
File reads_fastq = "~{output_interleaved}"
String start = read_string("start.txt")
}
runtime {
memory: "10 GiB"
cpu: 2
maxRetries: 1
docker: container
}
}
task rqcfilter {
input{
File? input_fastq
String container
String database
String rqcfilterdata = database + "/RQCFilterData"
Boolean chastityfilter_flag=true
Int memory
Int xmxmem = floor(memory * 0.85)
Int? threads
String filename_outlog="stdout.log"
String filename_errlog="stderr.log"
String filename_stat="filtered/filterStats.txt"
String filename_stat2="filtered/filterStats2.txt"
String filename_stat_json="filtered/filterStats.json"
String filename_reproduce="filtered/reproduce.sh"
String system_cpu="$(grep \"model name\" /proc/cpuinfo | wc -l)"
String jvm_threads=select_first([threads,system_cpu])
String chastityfilter= if (chastityfilter_flag) then "cf=t" else "cf=f"
}
runtime {
docker: container
memory: "~{memory} GiB"
cpu: 16
}
command<<<
export TIME="time result\ncmd:%C\nreal %es\nuser %Us \nsys %Ss \nmemory:%MKB \ncpu %P"
set -euo pipefail
rqcfilter2.sh \
~{if (defined(memory)) then "-Xmx" + xmxmem + "G" else "-Xmx60G" }\
-da \
threads=~{jvm_threads} \
~{chastityfilter} \
jni=t \
in=~{input_fastq} \
path=filtered \
rna=f \
trimfragadapter=t \
qtrim=r \
trimq=0 \
maxns=3 \
maq=3 \
minlen=51 \
mlf=0.33 \
phix=t \
removehuman=t \
removedog=t \
removecat=t \
removemouse=t \
khist=t \
removemicrobes=t \
sketch \
kapa=t \
clumpify=t \
barcodefilter=f \
trimpolyg=5 \
usejni=f \
rqcfilterdata=~{rqcfilterdata} \
> >(tee -a ~{filename_outlog}) \
2> >(tee -a ~{filename_errlog} >&2)
python <<CODE
import json
f = open("~{filename_stat}",'r')
d = dict()
for line in f:
if not line.rstrip():continue
key,value=line.rstrip().split('=')
d[key]=float(value) if 'Ratio' in key else int(value)
with open("~{filename_stat_json}", 'w') as outfile:
json.dump(d, outfile)
CODE
>>>
output {
File stdout = filename_outlog
File stderr = filename_errlog
File stat = filename_stat
File stat2 = filename_stat2
File info_file = filename_reproduce
File filtered = "filtered/raw.anqdpht.fastq.gz"
}
}
task make_info_file {
input{
File info_file
String prefix
String container
}
command<<<
set -oeu pipefail
sed -n 2,5p ~{info_file} 2>&1 | \
perl -ne 's:in=/.*/(.*) :in=$1:; s/#//; s/BBTools/BBTools(1)/; print;' > \
~{prefix}_readsQC.info
echo -e "\n(1) B. Bushnell: BBTools software package, http://bbtools.jgi.doe.gov/" >> \
~{prefix}_readsQC.info
>>>
output {
File rqc_info = "~{prefix}_readsQC.info"
}
runtime {
memory: "1 GiB"
cpu: 1
maxRetries: 1
docker: container
}
}
task finish_rqc {
input {
File filtered_stats
File filtered_stats2
File filtered
String container
String prefix
}
command<<<
set -oeu pipefail
end=`date --iso-8601=seconds`
# Generate QA objects
ln -s ~{filtered} ~{prefix}_filtered.fastq.gz
ln -s ~{filtered_stats} ~{prefix}_filterStats.txt
ln -s ~{filtered_stats2} ~{prefix}_filterStats2.txt
# Generate stats but rename some fields until the script is fixed.
/scripts/rqcstats.py ~{filtered_stats} > ~{prefix}_qa_stats.json
>>>
output {
File filtered_final = "~{prefix}_filtered.fastq.gz"
File filtered_stats_final = "~{prefix}_filterStats.txt"
File filtered_stats2_final = "~{prefix}_filterStats2.txt"
File json_out = "~{prefix}_qa_stats.json"
}
runtime {
docker: container
memory: "1 GiB"
cpu: 1
}
}