-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgroupPCRproducts.py
139 lines (118 loc) · 5.34 KB
/
groupPCRproducts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
## Check the PCR product results for overlapping resutls
## Currently if a region is good, multiple potential PCR primer regions are often returned for that region.
## With this script I want to exclude subotpimal PCR primer regions in close proximity
import argparse
from operator import itemgetter
# Instantiate the parser
parser = argparse.ArgumentParser(description='Extract only optimal PCR primer region per region')
parser.add_argument('--pcrProductfile',
help='Text file containing the output of diffMONT; a tab separated file containing informations about all PCR product regions')
def merge_overlappingIntervals(interval_list):
merged = []
## Sort the list by the first element
# sortedList = sorted(interval_list, key = itemgetter(0))
sortedList = sorted(interval_list, key=lambda tup: tup[0])
for higher in sortedList:
if not merged:
## always add the first interval
merged.append(higher)
else:
lower = merged[-1]
# test for intersection between lower and higher:
# we know via sorting that lower[0] <= higher[0]
if higher[0] <= lower[1]:
upper_bound = max(lower[1], higher[1])
merged[-1] = [lower[0], upper_bound] # replace by merged interval
## add new element if no overlap exists
else:
merged.append(higher)
return merged
# testList = [[1,3], [5,9], [2,3], [5,9], [4,5], [9,10]]
# print(merge_overlappingIntervals(testList))
# testList = [[1,3], [2,3], [4,5], [5,9], [9,10]]
## Initialize variables
chro_dict = {}
chro_dict_merged = {}
resList = []
## Extract the information from input file
args = parser.parse_args()
pcrProductfile = args.pcrProductfile
## 1. Create a dict with all overlapping regions
with open(pcrProductfile, 'r') as infile:
for line in infile:
if line.startswith("chromosome"):
continue
else:
# print(line)
# print(line.split())
chro = line.split()[0]
start = int(line.split()[2])
end = int(line.split()[8])
# igv_koords = chro + ':' + str(start) + '-' + str(end)
# line = line.strip() + '\t' + igv_koords
if not (chro in chro_dict.keys()):
chro_dict[chro] = []
chro_dict[chro].append([start, end])
# print("Before Sorting:")
# for chro in chro_dict.keys():
# for entry in chro_dict[chro]:
# print(f"{chro}\t{entry}")
# print("Before Sorting:")
# for chro in chro_dict.keys():
# print(f"{chro}\t{len(chro_dict[chro])}")
# print(chro)
# print(chro_dict[chro])
## Merge the overlapping regions for each chromosome
for chro in chro_dict.keys():
## merge all overlapping intervalls
chro_dict_merged[chro] = merge_overlappingIntervals(chro_dict[chro])
## add a "False" to each entry
for entry in chro_dict_merged[chro]:
entry.append(False)
# print("After Sorting:")
# for chro in chro_dict_merged.keys():
# print(f"{chro}\t{len(chro_dict_merged[chro])}")
# for entry in chro_dict_merged[chro]:
# print(f"{chro}\t{entry}")
## 2. Check for each PCR region if a PCR region from the same overlap region has already been reported
with open(pcrProductfile, 'r') as infile:
for line in infile:
# print(line)
# print(line.split())
if line.startswith("chromosome"):
short_line = line.split()[2:]
short_line = "\t".join(short_line)
print('chromosome\tstart_region\tend_region\tstrand\t' + short_line.strip() + '\tbest_koords\tregion_koords')
continue
else:
chro = line.split()[0]
start = int(line.split()[2])
end = int(line.split()[8])
igv_koords = chro + ':' + str(start) + '-' + str(end)
line = line.strip() + '\t' + igv_koords
## 2. Add only the best PCR region for each overlapping region
## Check if region on this chromosome exists already
if chro in chro_dict_merged.keys():
## check for all already listed regions if there is an overlap
for entry in chro_dict_merged[chro]:
# print(f"{chro}\t{entry}")
## check if any region is overlapped
if(((start <= entry[0]) and (entry[0] <= end)) or ((entry[0] <= start) and (start <= entry[1]))):
# print(entry)
# print(start)
## regions are overlapping. If something from this overlap has been used before, do not add.
if entry[2] == False:
# print(entry)
## Add the koordinates from the complete region
# line = line + '\t' + chro + ':' + str(entry[0]) + '-' + str(entry[1])
short_line = line.split()[2:]
short_line = "\t".join(short_line)
line = chro + '\t' + str(start) + '\t' + str(end) + '\t' + line.split()[1] + '\t' + short_line + '\t' + chro + ':' + str(entry[0]) + '-' + str(entry[1])
resList.append(line)
entry[2] = True
else:
continue
# print(resList)
for topRegion in resList:
print(topRegion)
## TODO: write output to file