-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_extragene.py
61 lines (54 loc) · 1.97 KB
/
remove_extragene.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python3
import os
import numpy as np
from tqdm import tqdm
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
##this is to remove the extra copy of the genomic positions 266 to 13483 that appear in the sars-cov-2
##xmfa files (it appears twice in the ncbi gff because of the structure of ORF1ab)
def main():
parser = ArgumentParser(
formatter_class=ArgumentDefaultsHelpFormatter,
description="split a .fasta.aln file generated by ViralMSA.py into separate files")
parser.add_argument("wrkdir", type=str, help="working directory")
parser.add_argument("xmfa_file",type=str, help="path to xmfa file generated by CollectGeneAlignments")
opts = parser.parse_args()
dir = opts.wrkdir
file = opts.xmfa_file
outdir = dir
allseqs = open(file, "r")
names = []
positions = []
seqline = 0
last = -1
badpositions = []
for position, seq in enumerate(allseqs):
##skip ncbi genome
if seq.startswith(">"):
terms = seq.split(" ")
gene_pos = terms[1]
gene_pos = (str.rstrip(gene_pos))
if gene_pos == "266+13483":
badpositions.append(position)
positions.append(position)
##the positions we don't want
badpositions = set(badpositions)
with open(file, "r") as master:
master_full = master.readlines()
outfile = file
with open(outfile, "w+") as out:
for j in tqdm(np.arange(0, len(positions))):
if positions[j] in badpositions:
continue
header = master_full[positions[j]]
out.write(header)
start = positions[j]
if j == len(positions)-1:
end = len(master_full)
else:
end = positions[j+1]
linenums = np.arange(start+1, end)
for i in linenums:
seqline = master_full[i]
out.write(seqline)
if __name__ == "__main__":
main()