-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcoordinateConverter1.py
executable file
·216 lines (190 loc) · 8.18 KB
/
coordinateConverter1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
""" Convert genomic coordinates between GRCh38 and GRCh37."""
__author__ = "Ray Stefancsik"
__version__ = "0.1"
######################################################################
# Import command-line parsing module from the Python standard library.
######################################################################
import argparse
#################################################################
# import modules recommended by
#### https://rest.ensembl.org/documentation/info/assembly_map
#################################################################
import requests
import sys
#################################################################
# input filename
fname = 'filename?'
parser = argparse.ArgumentParser( description='Convert genomic coordinates from GRCh38 to GRCh37 (default) or vice versa. All genomic coordinates are assumed to be on the positive strand. Input file is expected to be in a tabulated format (tsv/csv), e.g. :\n\t#chromosome start end\n\t17 36169091 36169091\n\tX 3084378 3084378', formatter_class=argparse.RawTextHelpFormatter )
parser.add_argument( 'input_file', help='Path to your input file.' )
# User options:
parser.add_argument('-v', '--verbose', help='Choose verbose output.', action='store_true')
# Optional, but mutually exclusive user options:
## genome build options
group1 = parser.add_mutually_exclusive_group()
group1.add_argument('-38', '--grch38', help='GRCh38 input assembly (default).', action='store_true')
group1.add_argument('-37', '--grch37', help='GRCh37 input assembly.', action='store_true')
## input file format options
group2 = parser.add_mutually_exclusive_group()
group2.add_argument('-t', '--tab', help='Use tab as column separator (default).', action='store_true')
group2.add_argument('-c', '--csv', help='Use comma as column separator.', action='store_true')
args = parser.parse_args()
# input filename
fname = args.input_file
##########################################################################
# define coordinate converter function
#################################################################
# https://rest.ensembl.org/documentation/info/assembly_map
# GET map/:species/:asm_one/:region/:asm_two
#################################################################
##########################################################################
def convertCoordinates( asmDirection, query_region ):
""" Convert a set of genomic coordinates."""
#################################################################
#default values in case ENSEMBL api failed to map coordinates between genome assemblies:
dfv = {
'original': {
'seq_region_name': 'failed',
'start': 'failed',
'coord_system': 'chromosome',
'strand': 1,
'assembly': asm[0],
'end': 'failed'
},
'mapped': {
'seq_region_name': 'failed',
'start': 'failed',
'coord_system': 'chromosome',
'strand': 'failed',
'assembly': asm[1],
'end': 'failed'
}
}
#default values in case ENSEMBL api finds multiple possible mapping coordinates between genome assemblies:
dfv2 = {
'original': {
'seq_region_name': 'multiple',
'start': 'multiple',
'coord_system': 'chromosome',
'strand': 1,
'assembly': asm[0],
'end': 'multiple'
},
'mapped': {
'seq_region_name': 'multiple',
'start': 'multiple',
'coord_system': 'chromosome',
'strand': 'multiple',
'assembly': asm[1],
'end': 'multiple'
}
}
#################################################################
server = 'https://rest.ensembl.org'
ext1 = '/map/human/'
asm_one = asmDirection[0]
queryRegion = '/' + query_region[0] + ':' + query_region[1] + '..' + query_region[2] + ':1/' # seq_region_name + start + stop
asm_two = asmDirection[1] + '?'
r = requests.get(server+ext1+asm_one+queryRegion+asm_two, headers={ "Content-Type" : "application/json"})
if not r.ok:
error1 = r.raise_for_status()
decoded = error1
decoded = r.json()
if len( decoded['mappings']) == 1: # check if query region maps to a single locus
mappings = ( decoded['mappings'][0] )
elif len( decoded['mappings']) > 1: # check if query region maps to multiple sites
mappings = dfv2
else: # conversion has failed
mappings = dfv
return(mappings) # returns a dictionary like in the following example:
##########################################################################
#### {
#### 'original': {
#### 'seq_region_name': 'X',
#### 'start': 1000000,
#### 'coord_system': 'chromosome',
#### 'strand': 1,
#### 'assembly': 'GRCh38',
#### 'end': 1000100
#### },
#### 'mapped': {
#### 'seq_region_name': 'HG480_HG481_PATCH',
#### 'start': 960735,
#### 'coord_system': 'chromosome',
#### 'strand': '1',
#### 'assembly': 'GRCh37',
#### 'end': 960835
#### }
#### }
##########################################################################
#################################################################
# Parse user input,
# read in a data file and
# initialise global variables.
#################################################################
### Input file format:
if args.csv:
delimiter = ',' #csv file format
else:
delimiter = '\t' #tsv file format
### Direction of conversion
if args.csv:
delimiter = ',' #csv file format
else:
delimiter = '\t' #tsv file format
# genome assembly versions for the direction of conversion
asm_one = 'GRCh37'
asm_two = 'GRCh38'
if args.grch37:
asm = tuple([asm_one, asm_two])
else:
asm = tuple([asm_two, asm_one])
# open input file
try:
fhand1 = open( fname )
except:
print('File cannot be opened:', fname)
exit()
# initialise variable to hold input
inputList = []
locus = tuple()
# Make an empty dictionary to hold unique query regions.
## This is to reduce the number of queries to the server in case of redundant query data.
loci = {}
# read in file line-by-line
for line in fhand1:
if line.startswith('#'): # skip header line
continue
line = line.strip()
columns = line.split(delimiter)
if len(columns) > 1: # check if line is not empty
locus = tuple(columns[:3])
inputList.append(locus)
else: # skip empty lines
continue
if locus in loci:
continue
else:
loci[locus] = convertCoordinates( asm, locus )
# close file
fhand1.close()
#################################################################
# Format and print results
#################################################################
# verbose output?
if args.verbose:
header = print( ' '.join(['#', asm[0], 'input coordinates']), 'Converted to assembly', 'seq_region_name', 'start', 'end', 'strand', sep='\t' )
for i in inputList:
results = [ loci[i]['mapped']['assembly'], loci[i]['mapped']['seq_region_name'], loci[i]['mapped']['start'], loci[i]['mapped']['end'], loci[i]['mapped']['strand'] ]
inputCoordinates = list(i)
# print(i, loci[i] )
# print( i, '\t'.join( str(r) for r in results), sep='\t')
print( ':'.join( str(c) for c in inputCoordinates), '\t'.join( str(r) for r in results), sep='\t')
# print( loci[i] )
else:
header = print( ' '.join(['#', asm[1], 'seq_region_name']), 'start', 'end', sep='\t' )
for i in inputList:
# results = [ loci[i]['mapped']['seq_region_name'], loci[i]['mapped']['start'], loci[i]['mapped']['end'], loci[i]['mapped']['assembly'], loci[i]['mapped']['strand'] ]
results = [ loci[i]['mapped']['seq_region_name'], loci[i]['mapped']['start'], loci[i]['mapped']['end'] ]
print( '\t'.join( str(r) for r in results), sep='\t')