-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathVCF_To_Htable.py
executable file
·118 lines (112 loc) · 5.04 KB
/
VCF_To_Htable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
# A python program to read in a VCF file and output a Hudson table-like
# format. Not "true" Hudson table, since hets are not handled properly.
# Usage:
# VCF_To_Htable.py [VCF file] > [Htable.txt]
# Peter L. Morrell - updated 18 Jan 2023 - 1) now Python3, 2) has only distance in
# header and not chromosome name, 3) now displays "haploid" data with only
# first base of each genotype printed. Still need to deal with phased genotypes where
# "0|1" or "1|1" can appear rather than "0/1" or "1/1".
import sys
# If the "minor genotype frequency" falls below this threshhold, then we
# omit the site.
MAFThreshhold = 0.05
# A function to calculate the minor allele frequency
def MAF(x):
# get the set of genotypes in the list
genotypes = set(x)
# start counting up the frequencies
freqs = []
for g in genotypes:
freqs.append(x.count(g)/float(len(x)))
return min(freqs)
# Empty lists for the genotype matrix and the loci
loci = []
g_matrix = []
# start reading through the file. We will skip any lines that start wtih '##'
with open(sys.argv[1], 'r') as f:
for index, line in enumerate(f):
if index%10000 == 0:
sys.stderr.write('Read ' + str(index) + ' sites.\n')
if line.startswith('##'):
continue
# #CHROM line is the one that contains the sample information
elif line.startswith('#CHROM'):
# Split up the line on tabs
tmp = line.strip().split('\t')
# Look for the 'FORMAT' field
format_field = tmp.index('FORMAT')
# And get the samples out of the list
# Add 1 to the format_field because we don't actually want to
# include 'FORMAT' in the sample info
samples = tmp[format_field + 1:]
# Write a little diagnostic message
sys.stderr.write(sys.argv[1] + ' has ' + str(len(samples)) + ' samples.\n')
# Now that we have the number and names of the samples, we print the
# genotype data
else:
# we split the line as above
tmp = line.strip().split('\t')
# assign the variables for clarity
scaffold = tmp[0]
locus = tmp[1]
var_id = tmp[2]
ref_allele = tmp[3]
# If there are multiple alternate alleles, they are listed with a comma between them
alt_alleles = tmp[4].split(',')
qual = tmp[5]
filt = tmp[6]
info = tmp[7]
format = tmp[8]
# And then the genotypes
genotypes = tmp[9:]
# The locus will be the scaffold number and then the bp position
#locus = scaffold + '_' + pos
# then we parse the genotypes
# we create a list here that will be a single column of the genotype matrix
g_column = []
for g in genotypes:
# In the genotype string, the first element (separated by :) is the actual genotype call
call = g.split(':')[0]
# These are diploid calls, and we are assuming they are unphased
# the are listed in the form allele1/allele2
# with 0 = ref, 1 = alt1, 2 = alt2, and so on...
alleles = call.split('/')[0]
individual_call = ''
for x in alleles:
if x == '.':
individual_call += 'N'
else:
# cast to integer so we can use it in slicing a list
c = int(x)
# if it's 0, we just tack on the reference state
if c == 0:
individual_call += ref_allele
else:
# Otherwise, we use it to slice the list of alternate alleles
individual_call += alt_alleles[c-1]
# Then append the individual call to the column for the genotype matrix
g_column.append(individual_call)
# Then, append that column to the genotype matrix
# If there is no variation in genotype calls (that is, all lines have the same genotype)
# then we don't care about it
unique_calls = set(g_column)
if len(unique_calls) <= 1:
continue
else:
if MAF(g_column) > MAFThreshhold:
g_matrix.append(g_column)
loci.append(locus)
else:
continue
# Now, we have to transpose the genotype matrix
g_matrix_t = zip(*g_matrix)
# print the number of samples and the number of loci
print (str(len(samples)) + '\t' + str(len(loci)))
# print the loci
print ('\t' + '\t'.join(loci))
# Print the line for unknown ancestral state
print ('anc\t' + '?\t'*(len(loci)-1) + '?')
# then print the transposed genotype matrix
for index, g in enumerate(g_matrix_t):
print (samples[index] + '\t' + '\t'.join(g))