-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathVCF_to_ESTSFS.py
163 lines (137 loc) · 6.52 KB
/
VCF_to_ESTSFS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/python3
"""Convert a VCF and BED files with genotype information for outgroup samples
into input for est-sfs. The primary dependency is the `cyvcf2` library for VCF
parsing.
"""
# Script by Jacob Pacheco to convert the output of est-sfs
# [doi:10.1534/genetics.118.301120] into inferred ancestral state.
# Should be suitable for annotation of a VCF or creating a list of
# ancestral state for a list of SNPs.
import gzip
import sys
from cyvcf2 import VCF
def est_vcf_convert(bed_file1, bed_file2, vcf_file, output_file, bed_file3=None):
'''This script takes three tab-delimited BED files with the following fields:
1 -chromosome
2 -zero-based variant position
3 -one-based variant position
4 -SNP name (or ".")
5 -reference allele
6 -alternate allele
7 -allele present at the position in an outgroup sample
8 -dependencies VCF, gzip, sys
The BED files need to be provided in order so that outgroup samples most closely
related to the focal species appear earlier (farther left in file order).
Outputs are designed for input for ancestral state inference in EST-SFS
and a VCF compatible file'''
# GZIPPED VCF FILES CURRENTLY DO NOT WORK
if vcf_file.endswith('.gz'):
vcf_file = gzip.open(vcf_file, 'rt')
else:
vcf_file = open(vcf_file, 'r')
if bed_file1.endswith('.gz'):
bed_file1 = gzip.open(bed_file1, 'rt')
else:
bed_file1 = open(bed_file1, 'r')
if bed_file2.endswith('.gz'):
bed_file2 = gzip.open(bed_file2, 'rt')
else:
bed_file2 = open(bed_file2, 'r')
if bed_file3:
if bed_file3.endswith('.gz'):
bed_file3 = gzip.open(bed_file3, 'rt')
else:
bed_file3 = open(bed_file3, 'r')
# Read BED files
bed_file1_lines = bed_file1.read().splitlines()
bed_file2_lines = bed_file2.read().splitlines()
if bed_file3:
bed_file3_lines = bed_file3.read().splitlines()
# Read VCF file
vcf_reader = VCF(vcf_file)
VCF_metadata = []
for record in vcf_reader:
chrom = record.CHROM if record.CHROM is not None else '.'
pos = record.POS - 1 if record.POS is not None else '.'
record_id = record.ID if record.ID is not None else '.'
ref = record.REF if record.REF is not None else '.'
alt = record.ALT[0] if record.ALT and record.ALT[0] is not None else '.'
ns = record.INFO.get('NS', '.')
ac = record.INFO.get('AC', '.')
VCF_metadata.append([chrom, pos, record_id, ref, alt, ns, ac])
# Close files
vcf_file.close()
bed_file1.close()
bed_file2.close()
if bed_file3:
bed_file3.close()
# Process BED and VCF data
nucleotide_Base_binary = {"A": "1,0,0,0", "C": "0,1,0,0", "G": "0,0,1,0", "T": "0,0,0,1"}
nucleotide_ref_binary = {"AT": [1,0,0,1], "AG": [1,0,1,0], "AC": [1,1,0,0], "CT": [0,1,0,1], "CG": [0,1,1,0], "CA": [1,1,0,0], "TG": [0,0,1,1], "TC": [0,0,1,1], "TA": [1,0,0,1], "GT": [0,0,1,1], "GC": [0,0,1,1], "GA": [1,0,1,0], 'A': [1,0,0,0], "C": [0,1,0,0], "G": [0,0,1,0], "T": [0,0,0,1]}
with open(output_file, "w") as outfile:
for i in range(len(bed_file1_lines)):
bed_file1_split = bed_file1_lines[i].split()
bed_file2_split = bed_file2_lines[i].split()
if bed_file3:
bed_file3_split = bed_file3_lines[i].split()
AC = VCF_metadata[i][-1]
if isinstance(AC, int):
OutputAC = f'AC={int(AC)}'
calcAC = AC
else:
calcAC = 0
OutputAC = '.'
NS = VCF_metadata[i][5]
if isinstance(NS, int):
OutputNS = f'NS={int(NS)}'
calcNS = NS
else:
calcNS = 0
OutputNS = '.'
ReferencealleleFreq = ((int(calcNS)*2) - (int(calcAC))) // 2
Alternateallelefreq = int(calcAC) // 2
reference_allele = str(VCF_metadata[i][4])
alternate_allele = str(VCF_metadata[i][3])
genotypesReference = nucleotide_ref_binary.get(reference_allele, [0,0,0,0])
genotypesReference = list(map(lambda x: x * Alternateallelefreq, genotypesReference))
genotypesAlternate = nucleotide_ref_binary.get(alternate_allele, [0,0,0,0])
genotypesAlternate = list(map(lambda x: x * ReferencealleleFreq, genotypesAlternate))
genotypes = [genotypesReference[j] + genotypesAlternate[j] for j in range(len(genotypesAlternate))]
result_str = ','.join(str(k) for k in genotypes)
BED_file1_binary_conversion = nucleotide_Base_binary.get(bed_file1_split[-1], "0,0,0,0")
BED_file2_binary_conversion = nucleotide_Base_binary.get(bed_file2_split[-1], "0,0,0,0")
if bed_file3:
BED_file3_binary_conversion = nucleotide_Base_binary.get(bed_file3_split[-1], "0,0,0,0")
converted_file = (
f"{VCF_metadata[i][1]}\t{VCF_metadata[i][2]}\t{VCF_metadata[i][3]}\t"
f"{VCF_metadata[i][4]}\t{OutputAC}\t{OutputNS}\t"
f"{bed_file1_split[-1]}\t{bed_file2_split[-1]}\t{bed_file3_split[-1]}\t"
)
EST_SFS_VCF_compatible_file = (result_str + "\t" +
BED_file1_binary_conversion + "\t" +
BED_file2_binary_conversion + "\t" +
BED_file3_binary_conversion + "\n")
outfile.write(EST_SFS_VCF_compatible_file)
else:
converted_file = (
f"{VCF_metadata[i][2]}\t{VCF_metadata[i][3]}\t{VCF_metadata[i][4]}\t"
f"{OutputAC}\t{OutputNS}\t"
f"{bed_file1_split[-1]}\t{bed_file2_split[-1]}\t"
)
EST_SFS_VCF_compatible_file = (result_str + "\t" +
BED_file1_binary_conversion + "\t" +
BED_file2_binary_conversion + "\n")
outfile.write(EST_SFS_VCF_compatible_file)
if len(sys.argv) == 6:
bed_file1 = sys.argv[1]
bed_file2 = sys.argv[2]
bed_file3 = sys.argv[3]
vcf_file = sys.argv[4]
output_file = sys.argv[5]
est_vcf_convert(bed_file1, bed_file2, vcf_file, output_file, bed_file3)
elif len(sys.argv) == 5:
bed_file1 = sys.argv[1]
bed_file2 = sys.argv[2]
vcf_file = sys.argv[3]
output_file = sys.argv[4]
est_vcf_convert(bed_file1, bed_file2, vcf_file, output_file)