-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathBarley_Pseudomolecules_to_Parts.py
executable file
·406 lines (387 loc) · 15.5 KB
/
Barley_Pseudomolecules_to_Parts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
#!/usr/bin/env python3
"""Convert from IPK full pseudomolecules to parts. Requires argparse."""
import sys
import argparse
import cyvcf2
from cyvcf2 import VCF
# Define the length of the part1 pieces as a dictionary constant.
# Morex v1 sizes
PARTS_SIZES_V1 = {
'chr1H': 312837513,
'chr2H': 393532674,
'chr3H': 394310633,
'chr4H': 355061206,
'chr5H': 380865482,
'chr6H': 294822070,
'chr7H': 325797516,
'chrUn': 249774706,
'Pt': 999999999999
}
# Morex v2 sizes including plastids
PARTS_SIZES_V2 = {
'chr1H': 205502676,
'chr2H': 305853815,
'chr3H': 271947776,
'chr4H': 282386439,
'chr5H': 205989812,
'chr6H': 260041240,
'chr7H': 328847863,
'chrUn': 85026395,
'EF115541.1': 136462,
'AP017301.1': 525599,
'Pt': 999999999999
}
# Morex v3 sizes
PARTS_SIZES_V3 = {
'chr1H': 206486643,
'chr2H': 301293086,
'chr3H': 267852507,
'chr4H': 276149121,
'chr5H': 204878572,
'chr6H': 256319444,
'chr7H': 328847192,
'chrUn': 29110253,
'EF115541.1': 136462,
'AP017301.1': 525599,
'Pt': 999999999999
}
def parse_args():
"""Set up an argument parser, and actually parse the args."""
parser = argparse.ArgumentParser(
description='Convert pseudomolecule coordinates to parts coordinates',
add_help=True)
# Add a mutually exclusive group for the file format. This is so that only
# one format can be specified.
fmt = parser.add_mutually_exclusive_group(required=True)
fmt.add_argument(
'--vcf',
'-v',
action='store_true',
help='Input file is VCF')
fmt.add_argument(
'--bed',
'-b',
action='store_true',
help='Input file is BED')
fmt.add_argument(
'--gff',
'-g',
action='store_true',
help='Input file is GFF')
fmt.add_argument(
'--gtf',
'-t',
action='store_true',
help='Input file is GTF')
fmt.add_argument(
'--reg',
'-r',
action='store_true',
help='Input file is SAMTools/ANGSD regions')
parser.add_argument(
'intervals',
metavar='file',
help='File with intervals to convert')
parser.add_argument(
'ref_version',
metavar='reference_version',
help='Input is one of the following: morex_v1, morex_v2, morex_v3')
a = parser.parse_args()
return a
def set_format(args):
"""Set the format that is being used for conversion."""
arg_dict = vars(args)
if arg_dict['vcf']:
return 'VCF'
elif arg_dict['bed']:
return 'BED'
elif arg_dict['gff']:
return 'GFF'
elif arg_dict['gtf']:
return 'GTF'
elif arg_dict['reg']:
return 'REG'
else:
return None
def vcf_conv(intervals, parts_sizes):
"""Convert VCF. Works for both GATK vcfs and larger SVs with "INFO/END" field
from callers like Sniffles2, cuteSV, Longranger."""
# If lines start with '#', print them
vcf = VCF(intervals)
# Print header lines to stdout
# Update header with new chromosome part names and lengths
for l in vcf.raw_header.strip('\n').split('\n'):
if l.startswith('##contig'):
if 'chrUn' in l or 'EF115541.1' in l or 'AP017301.1' in l:
# Print as is, no modifications needed
print(l)
else:
# Split into chromosome parts
contig = l.split('=')
chrom_len = contig[3].strip('>')
chrom_list = contig[2].split(',')
# Prepare new chromosome parts names
chrom_part1_name = chrom_list[0] + '_part1'
chrom_part2_name = chrom_list[0] + '_part2'
# Get new parts chromosome lengths
chrom_part1_len = parts_sizes[chrom_list[0]]
chrom_part2_len = int(chrom_len) - int(chrom_part1_len)
# Check if chromosome parts lengths add up correctly
if str(chrom_part1_len + chrom_part2_len) == str(chrom_len):
# Prepare updated contig header lines
# Each chromosome header line should get split into two parts
print('='.join([contig[0], contig[1], chrom_part1_name + ',length', str(chrom_part1_len) + '>']))
print('='.join([contig[0], contig[1], chrom_part2_name + ',length', str(chrom_part2_len) + '>']))
else:
print(l)
# Check if we have INFO/END field present in the VCF
# If so, we'll need to pull the end position from the END field
# This is a larger SV with the end position in the INFO/END field
# (e.g., output from callers like Sniffles2, cuteSV, and Longranger)
if vcf.contains("END"):
for variant in vcf:
chrom = variant.CHROM
start_pos = variant.POS
# Position in INFO/END field
end_pos = variant.INFO["END"]
# Check that the chromosomes are named as we are expecting
if chrom not in parts_sizes:
sys.stderr.write(chrom + ' not recognized. The chromosomes must be named like \'chr1H.\'\n')
return
# Check the parts lengths. If the position is greater than the
# part1 length, then subtract the part1 length, and change the
# name to part2
# Also passing 'chrUn' unchanged, as there is only 1 part
limit = parts_sizes[chrom]
if chrom == 'chrUn' or chrom == 'Pt':
# No changes made, print as is
print(str(variant).strip('\n'))
elif start_pos > limit and end_pos > limit:
# This is important, otherwise conversion for records that span
# both chrom parts will be incorrect
# Modify in current record
# New chromosome name
variant.CHROM = newchrom = chrom + '_part2'
# New start position
variant.set_pos(start_pos - limit)
# New end position
variant.INFO["END"] = end_pos - limit
# Print modified record to stdout
print(str(variant).strip('\n'))
elif start_pos < limit and end_pos <= limit:
# Modify in current record
# New chromosome name
variant.CHROM = chrom + '_part1'
# Print modified record to stdout
print(str(variant).strip('\n'))
elif start_pos <= limit and end_pos > limit:
# Record spans both chrom parts, exit with error
sys.stderr.write(chrom + ':' + str(start_pos) + '-' + str(end_pos) + ' spans chrom parts, please check manually before proceeding.\n')
return
else:
sys.stderr.write(chrom + ':' + str(start_pos) + '-' + str(end_pos) + ' is an edge case that is not dealt with properly, please check manually and modify code if necessary before proceeding.\n')
return
else:
# no INFO/END field present (e.g., vcf output from GATK)
for variant in vcf:
chrom = variant.CHROM
pos = variant.POS
# Check that the chromosomes are named as we are expecting
if chrom not in parts_sizes:
sys.stderr.write(chrom + ' not recognized. The chromosomes must be named like \'chr1H.\'\n')
return
# Check the parts lengths. If the position is greater than the
# part1 length, then subtract the part1 length, and change the
# name to part2
# Also passing 'chrUn' unchanged, as there is only 1 part
limit = parts_sizes[chrom]
if chrom == 'chrUn' or chrom == 'Pt':
# No changes made, print as is
print(str(variant).strip('\n'))
elif pos > limit:
# Modify in current record
# New chromosome name
variant.CHROM = newchrom = chrom + '_part2'
# New start position
variant.set_pos(pos - limit)
# Print modified record to stdout
print(str(variant).strip('\n'))
elif pos <= limit:
# Modify in current record
# New chromosome name
variant.CHROM = chrom + '_part1'
# Print modified record to stdout
print(str(variant).strip('\n'))
else:
sys.stderr.write(chrom + ':' + str(start_pos) + '-' + str(end_pos) + ' is an edge case that is not dealt with properly, please check manually and modify code if necessary before proceeding.\n')
return
def bed_conv(intervals, parts_sizes):
"""Convert BED."""
with open(intervals, 'r') as f:
for line in f:
if line.startswith('#'):
print(line.strip())
continue
tmp = line.strip().split()
chrom = tmp[0]
# Check that the chromosomes are named as we are expecting
if chrom not in parts_sizes:
sys.stderr.write(chrom + ' not reconized. The chromosomes must be named like \'chr1H.\'\n')
return
start = int(tmp[1])
end = int(tmp[2])
# Check the coordinates of the start and stop positions with respect
# to the part1 boundary
limit = parts_sizes[chrom]
if chrom == 'chrUn' or chrom == 'Pt':
newchrom = chrom
newstart = str(start)
newend = str(end)
elif (start + 1) > limit and (end + 1) > limit:
newchrom = chrom + '_part2'
newstart = str(start - limit)
newend = str(end - limit)
elif (start + 1) <= limit and (end + 1) > limit:
sys.stderr.write('Interval ' + line.strip() + ' spans parts, omitting.\n')
continue
else:
newchrom = chrom + '_part1'
newstart = str(start)
newend = str(end)
print('\t'.join([newchrom, newstart, newend]))
return
def gff_conv(intervals, parts_sizes):
"""Convert GFF."""
with open(intervals, 'r') as f:
for line in f:
if line.startswith('#'):
if line.startswith('##sequence-region'):
tmp_header = line.strip().split()
curr_chrom = tmp_header[1]
start_pos = tmp_header[2]
end_pos = tmp_header[3]
if curr_chrom != "chrUn":
# Generate ##sequence-region header for split parts
curr_part1_size = parts_sizes[curr_chrom]
# Print part1
print(tmp_header[0], " ", curr_chrom + '_part1 ', str(start_pos), ' ', str(curr_part1_size))
# Print part2
print(tmp_header[0], " ", curr_chrom + '_part2 ', str(curr_part1_size + 1), ' ', str(end_pos))
else:
# chrUn usually isn't split
print(line.strip())
else:
print(line.strip())
continue
tmp = line.strip().split('\t')
chrom = tmp[0]
start = int(tmp[3])
end = int(tmp[4])
if chrom not in parts_sizes:
sys.stderr.write(chrom + ' not reconized. The chromosomes must be named like \'chr1H.\'\n')
return
limit = parts_sizes[chrom]
if chrom == 'chrUn' or chrom == 'Pt':
newchrom = chrom
newstart = str(start)
newend = str(end)
elif start > limit and end > limit:
newchrom = chrom + '_part2'
newstart = str(start - limit)
newend = str(end - limit)
elif start+1 <= limit and end+1 > limit:
sys.stderr.write('Interval ' + line.strip() + ' spans parts, omitting.\n')
continue
else:
newchrom = chrom + '_part1'
newstart = str(start)
newend = str(end)
print('\t'.join([newchrom, tmp[1], tmp[2], newstart, newend] + tmp[5:]))
return
def gtf_conv(intervals, parts_sizes):
"""Convert GFF."""
with open(intervals, 'r') as f:
for line in f:
tmp = line.strip().split()
chrom = tmp[0]
start = int(tmp[3])
end = int(tmp[4])
if chrom not in parts_sizes:
sys.stderr.write(chrom + ' not reconized. The chromosomes must be named like \'chr1H.\'\n')
return
limit = parts_sizes[chrom]
if chrom == 'chrUn' or chrom == 'Pt':
newchrom = chrom
newstart = str(start)
newend = str(end)
elif start > limit and end > limit:
newchrom = chrom + '_part2'
newstart = str(start - limit)
newend = str(end - limit)
elif start+1 <= limit and end+1 > limit:
sys.stderr.write('Interval ' + line.strip() + ' spans parts, omitting.\n')
continue
else:
newchrom = chrom + '_part1'
newstart = str(start)
newend = str(end)
print('\t'.join([newchrom, tmp[1], tmp[2], newstart, newend] + tmp[5:]))
return
def reg_conv(intervals, parts_sizes):
"""Convert SAM region."""
with open(intervals, 'r') as f:
for line in f:
if ':' not in line or '-' not in line:
sys.stderr.write(line.strip() + ' is not formatted properly. This script expects chr:start-stop format.\n')
return
tmp = line.strip().split(':')
chrom = tmp[0]
start, end = [int(i) for i in tmp[1].split('-')]
limit = parts_sizes[chrom]
if chrom == 'chrUn' or chrom == 'Pt':
newchrom = chrom
newstart = str(start)
newend = str(end)
elif start > limit and end > limit:
newchrom = chrom + '_part2'
newstart = str(start - limit)
newend = str(end - limit)
elif start+1 <= limit and end+1 > limit:
sys.stderr.write('Interval ' + line.strip() + ' spans parts, omitting.\n')
continue
else:
newchrom = chrom + '_part1'
newstart = str(start)
newend = str(end)
print(newchrom + ':' + newstart + '-' + newend)
return
def main():
"""Main function."""
args = parse_args()
f = set_format(args)
# Check which version of the Morex reference we are using
if args.ref_version == "morex_v1":
parts_sizes = PARTS_SIZES_V1
elif args.ref_version == "morex_v2":
parts_sizes = PARTS_SIZES_V2
elif args.ref_version == "morex_v3":
parts_sizes = PARTS_SIZES_V3
else:
print('Invalid reference version provided, valid options are: morex_v1, morex_v2, morex_v3')
exit(1)
# Then, use the format and the supplied file to convert the coordinates
if f == 'VCF':
vcf_conv(args.intervals, parts_sizes)
elif f == 'BED':
bed_conv(args.intervals, parts_sizes)
elif f == 'GFF':
gff_conv(args.intervals, parts_sizes)
elif f == 'GTF':
gtf_conv(args.intervals, parts_sizes)
elif f == 'REG':
reg_conv(args.intervals, parts_sizes)
else:
print('Unrecognized format, weird.')
exit(1)
return
main()