-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathinterpro_tsv2function.py
More file actions
executable file
·51 lines (43 loc) · 1.73 KB
/
interpro_tsv2function.py
File metadata and controls
executable file
·51 lines (43 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
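"""Parse InterProScan tsv output and report predicted functions for each gene/protein.

Writes a tab-separated report (<input basename>.with_functions.txt) with one
line per protein: the protein ID followed by its "; "-joined functions.

NOTE: writing an annotated fasta file (.with_functions.fasta) is not implemented
in this script; only the text report is produced.

Usage: python interpro_tsv2function.py -i interproscan.tsv
"""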
import argparse
import os
import sys

from Bio import SeqIO
# 0-based index of the TSV column read as the protein's "function"
# (13th column; per the InterProScan TSV format this is the InterPro
# annotation description, an optional column that may be absent).
FUNCTION_COLUMN = 12


def collect_functions(lines):
    """Group function descriptions by protein ID.

    Args:
        lines: Iterable of InterProScan TSV lines (e.g. an open file).

    Returns:
        Dict mapping protein ID (first column) to the set of distinct
        function strings found in column ``FUNCTION_COLUMN``. Rows that are
        too short to contain that column, or whose function is ``-`` or
        starts with ``NULL``, are skipped.
    """
    protid2function = {}
    for line in lines:
        fields = line.strip().split("\t")
        # The function column is optional; rows without it carry no
        # annotation. (Indexing it unconditionally raised IndexError on
        # rows with exactly 12 columns.)
        if len(fields) <= FUNCTION_COLUMN:
            continue
        protid, function = fields[0], fields[FUNCTION_COLUMN]
        if function == "-" or function.startswith("NULL"):
            continue
        protid2function.setdefault(protid, set()).add(function)
    return protid2function


def make_output_path(in_file):
    """Return the report path for ``in_file``.

    The file name is cut at its first dot and ``.with_functions.txt`` is
    appended. Only the file name is cut, never the directory part, so
    ``./run.1/x.tsv`` maps to ``./run.1/x.with_functions.txt``.
    """
    directory, filename = os.path.split(in_file)
    stem = filename.split(".")[0]
    return os.path.join(directory, stem + ".with_functions.txt")


def main(argv=None):
    """Parse the command line, read the TSV and write the function report.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Side effects:
        Writes ``<input stem>.with_functions.txt`` next to the input file,
        prints a summary to stderr and the output path to stdout.
    """
    parser = argparse.ArgumentParser(description="Parse InterProScan tsv to get functions")
    parser.add_argument("-i", "--inFile", dest="inFile", required=True, help="InterProScan tsv file")
    args = parser.parse_args(argv)

    with open(args.inFile) as handle:
        protid2function = collect_functions(handle)

    outfn = make_output_path(args.inFile)
    with open(outfn, "w") as outfile:
        for protid, functions in sorted(protid2function.items()):
            # Sort the functions so the report is identical between runs
            # (set iteration order depends on string hash randomisation).
            print(protid, "; ".join(sorted(functions)), sep="\t", file=outfile)

    sys.stderr.write(
        "#%s unique proteins annotated with %s functions\n"
        % (len(protid2function), sum(len(x) for x in protid2function.values()))
    )
    # NOTE(review): the file written is a tab-separated text report, not a
    # FastA; the message wording is kept unchanged for compatibility.
    print("Saved annotated FastA as: %s" % outfn)


if __name__ == "__main__":
    main()