-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathGenoMatrix_to_FASTA.py
executable file
·78 lines (72 loc) · 2.65 KB
/
GenoMatrix_to_FASTA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
"""A script to take a genotyping matrix with population assignment and produce
FASTA files for each population. This was written with K. Thornton's
libsequence tools in mind. This script will also remove monomorphic sites.
Assumes that samples are rows and markers are columns. The first column has a
population of origin and the second column is the individual ID."""
import sys
genotype_matrix = sys.argv[1]
missing = 'NA'
# This dictionary will contain the population genetic data that we are
# reading from the input file. It will be of the form
# {
# 'Pop1_ID': {
# 'PIs': [PIs],
# 'Genotypes': [genotypes],
# ...
# }
# 'Pop2_ID': {
# ...
# }
# ...
# }
popdata = {}
with open(genotype_matrix, 'r') as f:
for index, line in enumerate(f):
if index == 0:
continue
else:
tmp = line.strip().split()
# Get the population identifier and PI number
popid = tmp[0]
pi_no = tmp[1]
# Assign some population IDs. This is to do the combined
# population summary stats.
# if popid == '1' or popid == '2':
# popid = '12'
# Stick it into the dictionary
if popid not in popdata:
popdata[popid] = {
'PIs': [],
'Genotypes': []
}
popdata[popid]['PIs'].append(pi_no)
popdata[popid]['Genotypes'].append(tmp[2:])
# Next, we iterate through the dictionary, removing monomorphic markers
for pop in popdata:
gen_mat = popdata[pop]['Genotypes']
# Transpose the genotype matrix so we can iterate through the markers
gen_mat = zip(*gen_mat)
# And start a new matrix for filtered data
filtered = []
for marker in gen_mat:
# what are the states?
alleles = set(marker)
# Discard any missing data
alleles.discard(missing)
# If they are monomorphic after discarding missing data, then toss it
if len(alleles) == 1:
continue
else:
filtered.append(marker)
# Then, we transpose it back
filtered = zip(*filtered)
# And write it out!
handle = open('Pop_' + pop + '.fasta', 'w')
for pi, genotypes in zip(popdata[pop]['PIs'], filtered):
# We need to convert the NA into N
new_geno = ['N' if x == 'NA' else x for x in genotypes]
towrite = '>' + pi + '\n' + ''.join(new_geno) + '\n'
handle.write(towrite)
handle.flush()
handle.close()