First commit

katarinaelez · Jan 24, 2019 · 4799689 · 4799689
1 parent bd5d3bf
commit 4799689
Show file tree

Hide file tree

Showing 6 changed files with 234 additions and 0 deletions.
diff --git a/models/model.npz b/models/model.npz
diff --git a/src/gor-predict.py b/src/gor-predict.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from utils import parse_pssm, parse_fasta, seq_to_profile
+
+import sys
+
+if __name__ == "__main__":
+
+	try:
+		import argparse
+		import numpy as np
+	except ImportError as e:
+		print('This program requires Python 2.7+ and numpy', file=sys.stderr)
+		raise ImportError(e)
+
+	parser = argparse.ArgumentParser(description='Use the GOR method to predict protein secondary structure from a PSSM or from a FASTA file.')
+	group = parser.add_mutually_exclusive_group(required=True)
+	group.add_argument('--pssm', type=str, help='File containing the PSSM profile.')
+	group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.')
+	parser.add_argument('filename_model', type=str, help='File containing the GOR model.')
+	args = parser.parse_args()
+
+	model = np.load(args.filename_model)['model'].item()
+
+	if args.pssm:
+		profile = parse_pssm(args.pssm)
+	else:
+		profile = seq_to_profile(parse_fasta(args.fasta))
+
+	dssp = ''
+
+	for i in range(0, len(profile)):
+		half_window_size = int((len(model['-'])-1)/2)
+		score_H, score_E, score_C = 0, 0, 0
+		for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))):
+			for k in range(0, len(profile[i])):
+				score_H += profile[j][k] * model['H'][j-i+half_window_size][k]
+				score_E += profile[j][k] * model['E'][j-i+half_window_size][k]
+				score_C += profile[j][k] * model['-'][j-i+half_window_size][k]
+		scores = [score_H, score_E, score_C]
+		dssp += ['H', 'E', '-'][scores.index(max(scores))]
+
+	print(dssp)
diff --git a/src/gor-train.py b/src/gor-train.py
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from utils import parse_pssm, parse_dssp
+
+import sys
+
+if __name__ == "__main__":
+
+	try:
+		import argparse
+		import numpy as np
+	except ImportError as e:
+		print('This program requires Python 2.7+ and numpy', file=sys.stderr)
+		raise ImportError(e)
+
+	parser = argparse.ArgumentParser(description='Train the GOR method on PSSM and DSSP files.')
+	parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.')
+	parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.')
+	parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.')
+	parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".')
+	parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.')
+	args = parser.parse_args()
+
+	with open(args.filename_id_list) as id_list:
+
+		model = {}
+		model['H'] = np.zeros((args.window_size,20),dtype=float)
+		model['E'] = np.zeros((args.window_size,20),dtype=float)
+		model['-'] = np.zeros((args.window_size,20),dtype=float)
+
+		for line in id_list:
+			line = line.rstrip()
+			profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm'))
+			if np.sum(profile) != 0:
+				ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp')
+				ss = ss.replace('C','-')
+				for i in range(0, len(profile)):
+					half_window_size = int((args.window_size-1)/2)
+					for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))):
+						for k in range(0, len(profile[i])):
+							matrix = model[ss[i]]
+							matrix[j-i+half_window_size][k] += float(profile[j][k])
+							model[ss[i]] = matrix
+
+		model['H'] = np.divide(model['H'], np.sum(model['H'], axis=1).reshape(17,1))
+		model['E'] = np.divide(model['E'], np.sum(model['E'], axis=1).reshape(17,1))
+		model['-'] = np.divide(model['-'], np.sum(model['-'], axis=1).reshape(17,1))
+		np.savez(args.filename_model+'.npz', model=model)
diff --git a/src/svm-predict.py b/src/svm-predict.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from utils import parse_pssm, parse_fasta, seq_to_profile
+
+import sys
+
+if __name__ == "__main__":
+
+	try:
+		import argparse
+		import pickle
+		import numpy as np
+		from sklearn import svm
+	except ImportError as e:
+		print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr)
+		raise ImportError(e)
+
+	parser = argparse.ArgumentParser(description='Use the SVM method to predict protein secondary structure from a PSSM or from a FASTA file.')
+	group = parser.add_mutually_exclusive_group(required=True)
+	group.add_argument('--pssm', type=str, help='File containing the PSSM profile.')
+	group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.')
+	parser.add_argument('--probs', action='store_true', help='Output class probabilities together with the prediction.')
+	parser.add_argument('filename_model', type=str, help='File containing the model.')
+	args = parser.parse_args()
+
+	model = pickle.load(open(args.filename_model, 'rb'))
+
+	if args.pssm:
+		profile = np.array(parse_pssm(args.pssm))
+	else:
+		profile = np.array(seq_to_profile(parse_fasta(args.fasta)))
+
+	dssp = ''
+
+	for i in range(0, len(profile)):
+		half_window_size = int((len(model.support_vectors_[0])/20-1)/2)
+		part1 = np.zeros(20*max(0, half_window_size-i))
+		part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))])
+		part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1)))
+		vec = np.concatenate((part1, part2, part3))
+		if args.probs:
+			probs = model.predict_proba(vec.reshape(1, -1))[0]
+			dssp += ' '.join([str(round(prob, 3)).rjust(5) for prob in probs])
+			dssp += ' ' + ['H', 'E', '-'][np.argmax(probs)] + '\n'
+		else:
+			dssp += ['H', 'E', '-'][model.predict(vec.reshape(1, -1))[0]]
+
+	if args.probs:
+		dssp = dssp[:-1]
+	print(dssp)
diff --git a/src/svm-train.py b/src/svm-train.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+from __future__ import print_function
+from utils import parse_pssm, parse_dssp
+
+import sys
+
+if __name__ == "__main__":
+
+	try:
+		import argparse
+		import pickle
+		import numpy as np
+		from sklearn import svm
+	except ImportError as e:
+		print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr)
+		raise ImportError(e)
+
+	parser = argparse.ArgumentParser(description='Train the SVM method on PSSM and DSSP files.')
+	parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.')
+	parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.')
+	parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.')
+	parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".')
+	parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.')
+	args = parser.parse_args()
+
+	with open(args.filename_id_list) as id_list:
+
+		X, y = [], []
+
+		ss_map = {'H': 0, 'E': 1, 'C': 2, '-': 2}
+
+		for line in id_list:
+			line = line.rstrip()
+			profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm'))
+			if np.sum(profile) != 0:
+				ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp')
+				for i in range(0, len(profile)):
+					half_window_size = int((args.window_size-1)/2)
+					part1 = np.zeros(20*max(0, half_window_size-i))
+					part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))])
+					part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1)))
+					vec = np.concatenate((part1, part2, part3))
+					X.append(vec.tolist())
+					y.append(ss_map[ss[i]])
+
+		X, y = np.array(X), np.array(y)
+
+		model = svm.SVC(kernel='rbf',C=1,gamma=0.125,probability=True)
+		model.fit(X, y)
+		pickle.dump(model, open(args.filename_model+'.sav', 'wb'))
diff --git a/src/utils.py b/src/utils.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+
+def parse_pssm(pssm_filename):
+	profile = []
+	with open(pssm_filename) as pssm:
+		pssm_lines = pssm.readlines()
+		for line in pssm_lines[3:-6]:
+			profile_line = []
+			for n in line.rstrip().split()[22:-2]:
+				profile_line.append(float(n)/100)
+			profile.append(profile_line)
+	return profile
+
+def parse_dssp(dssp_filename):
+	ss = ''
+	with open(dssp_filename) as dssp:
+		dssp.readline()
+		ss = dssp.readline().rstrip()
+	return ss
+
+def parse_fasta(fasta_filename):
+	seq = ''
+	with open(fasta_filename) as fasta:
+		fasta.readline()
+		seq = fasta.readline().rstrip()
+	return seq
+
+def seq_to_profile(seq):
+	profile = []
+	aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
+	for res in seq:
+		profile_line = []
+		for aa in aa_list:
+			if res == aa:
+				profile_line.append(1)
+			else:
+				profile_line.append(0)
+		profile.append(profile_line)
+	return profile