-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathrun_QC_analysis.py
150 lines (133 loc) · 6.72 KB
/
run_QC_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
from __future__ import absolute_import
import sys, os, yaml, glob
import subprocess
import argparse
import re
from sciLifeLab_utils import submit_job
def main(args):
projectFolder = os.getcwd()
samples_data_dir = args.sample_data_dir
projectName = os.path.basename(os.path.normpath(samples_data_dir))
for sample_dir_name in [dir for dir in os.listdir(samples_data_dir) \
if os.path.isdir(os.path.join(samples_data_dir, dir))]:
sample_folder = os.path.join(os.getcwd(), sample_dir_name)
if not os.path.exists(sample_folder):
os.makedirs(sample_folder)
os.chdir(sample_folder)
# now I am in the folder, i can run at the same time QC and MP anlaysis
pipeline = "QCcontrol"
tools = ["trimmomatic", "fastqc", "abyss", "align"]
if args.reference is None:
tools = ["trimmomatic", "fastqc", "abyss"]
sample_YAML_name = os.path.join(sample_folder, "{}_{}.yaml".format(
sample_dir_name, pipeline))
sample_YAML = open(sample_YAML_name, 'w')
sample_YAML.write("pipeline:\n")
sample_YAML.write(" {}\n".format(pipeline))
sample_YAML.write("tools:\n")
sample_YAML.write(" {}\n".format(tools))
##TODO: output must became sampleName
sample_YAML.write("output: {}\n".format(sample_dir_name))
sample_YAML.write("projectName: {}\n".format(projectName))
sample_YAML.write("kmer: 35\n")
sample_YAML.write("threads: {}\n".format(args.threads))
sample_YAML.write("genomeSize: \n")
sample_YAML.write("adapters: {}\n".format(args.adapter))
if args.reference is not None:
sample_YAML.write("reference: {}\n".format(args.reference))
sample_YAML.write("libraries:\n")
sample_data_dir = os.path.join(samples_data_dir,sample_dir_name)
# helper variables for collecting FCs
fc_pat, prep_pat = (r'^\d{6}_.*_?.*$', r'^[A-Z]$')
def _get_expected_dir(path, pat):
return [os.path.join(path, d) for d in os.listdir(path) if re.match(pat, d) \
and os.path.isdir(os.path.join(path, d))]
#collect FC directories
flowcells_dirs = _get_expected_dir(sample_data_dir, fc_pat)
# to adapt the directory structure in IRMA where it have lib prep dir
lib_prep_dirs = _get_expected_dir(sample_data_dir, prep_pat)
# Check and collect the flowcells in the lib prep directory
for prep_dir in lib_prep_dirs:
flowcells_dirs.extend(_get_expected_dir(prep_dir, fc_pat))
sample_files = []
for flowcell in flowcells_dirs:
sample_files.extend([os.path.join(flowcell, f) for f in \
os.listdir(flowcell) \
if (os.path.isfile(os.path.realpath(os.path.join(flowcell,f))) \
and re.search('.gz$',f))])
# now sample_files contains all the file sequenced for this sample
pair1_file = ""
pair2_file = ""
single = ""
library = 1
while len(sample_files) > 0:
file = sample_files[0]
sample_YAML.write(" lib{}:\n".format(library))
if "_1.fastq.gz" in file:
pair1_file = file
pair2_file = re.sub("_1.fastq.gz", "_2.fastq.gz", file)
elif "_2.fastq.gz" in file:
pair2_file = file
pair1_file = re.sub("_2.fastq.gz", "_1.fastq.gz", file)
elif "R1_001.fastq.gz" in file:
pair1_file = file
pair2_file = re.sub("R1_001.fastq.gz", "R2_001.fastq.gz", file)
elif "R2_001.fastq.gz" in file:
pair2_file = file
pair1_file = re.sub("R2_001.fastq.gz", "R1_001.fastq.gz", file)
else:
sys.exit("file {} does not respect naming convection. \
Exit!".format(file))
sample_YAML.write(" pair1: {}\n".format(pair1_file))
sample_YAML.write(" pair2: {}\n".format(pair2_file))
sample_YAML.write(" orientation: {}\n".format(args.orientation))
sample_YAML.write(" insert: {}\n".format(args.insert))
sample_YAML.write(" std: {}\n".format(args.std))
sample_files.remove(pair1_file)
sample_files.remove(pair2_file)
library += 1
sample_YAML.close
# Run the job
extramodules = []
if "abyss" in tools:
extramodules.append("module load abyss/1.3.5\n")
if "align" in tools:
extramodules.append("module load samtools\nmodule load bwa\n")
jobname = "{}_{}".format(sample_dir_name, pipeline)
submit_job(sample_YAML_name, jobname, os.getcwd(), args, extramodules)
os.chdir(projectFolder)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--reference', type=str, default=None,
help="path to the reference file")
parser.add_argument('--adapter', type=str, required=True,
help="path to the file containing the adaptor sequence to be removed")
parser.add_argument('--global-config', type=str, required=True,
help="global configuration file")
parser.add_argument('--sample-data-dir', type=str, required=True,
help=("Path to directory (usually INBOX) containing the project "
"(one dir per sample, scilife structure project/sample/flowcell/)"))
parser.add_argument('--orientation', type=str, required=True,
help="orientation of the libraries")
parser.add_argument('--insert', type=str, required=True,
help="expected insert size of the libraries")
parser.add_argument('--std', type=str, required=True,
help=("expected stdandard variation of the insert size of "
"the libraries"))
parser.add_argument('--env', type=str,
default="DeNovoPipeline", help=("name of the virtual enviorment "
"(default is DeNovoPipeline)"))
parser.add_argument('--email', type=str,
help=("Send notifications/job status updates to this email "
"address."))
parser.add_argument('--time', type=str, default="1-00:00:00",
help="required time for the job (default is 1 day : 1-00:00:00)")
parser.add_argument('--project', type=str, default="a2010002",
help="project name for slurm submission (default is a2010002)")
parser.add_argument('--threads', type=int, default=16,
help="Number of thread the job will require")
parser.add_argument('--qos', type=str,
help=("Specify a quality of service preset for the job (eg. "
"--qos short)"))
args = parser.parse_args()
main(args)