-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_fasta.py
83 lines (71 loc) · 2.54 KB
/
split_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python3
import os
import numpy as np
from tqdm import tqdm
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
def mkdir_p(dir):
if not os.path.exists(dir):
os.makedirs(dir)
return
def main():
parser = ArgumentParser(
formatter_class=ArgumentDefaultsHelpFormatter,
description="split a .fasta.aln file generated by ViralMSA.py into separate files")
parser.add_argument("wrkdir", type=str, help="working directory")
parser.add_argument("fasta",type=str, help="path to .fasta.aln file generated by ViralMSA.py")
opts = parser.parse_args()
dir = opts.wrkdir
file = opts.fasta
outdir = os.path.join(dir, "aligned_genomes")
mkdir_p(outdir)
allseqs = open(file, "r")
names = []
positions = []
seqline = 0
for position, seq in enumerate(allseqs):
##skip ncbi genome
if position == 0:
continue
else:
if seq.startswith(">"):
terms = seq.split(">")
name = terms[len(terms)-1]
names.append(str.rstrip(name))
positions.append(position)
with open(file, "r") as master:
master_full = master.readlines()
shortnames = []
for j in tqdm(np.arange(0, len(names))):
name = names[j]
nickname = name.replace("/", "_")
shortnames.append(nickname)
fastafile = os.path.join(dir,"aligned_genomes", nickname+".fa")
with open(fastafile, "w+") as fasta:
#print(name)
fasta.write(">"+name+"\n")
start = positions[j]
if j == len(positions)-1:
end = len(master_full)
else:
end = positions[j+1]
linenums = np.arange(start+1, end)
for i in linenums:
seqline = master_full[i]
seqline = seqline.replace("a", "A")
seqline = seqline.replace("t", "T")
seqline = seqline.replace("c", "C")
seqline = seqline.replace("g", "G")
seqline = seqline.replace("N", "-")
fasta.write(seqline)
##no backslashes
genomelist = os.path.join(dir, "genomefile_list")
with open(genomelist, "w+") as genomes:
for name in shortnames:
genomes.write(name+"\n")
genomelist = os.path.join(dir, "genome_name_list")
with open(genomelist, "w+") as genomes:
for name in names:
genomes.write(name+"\n")
print("number of seqs: %s" % len(names))
if __name__ == "__main__":
main()