diff --git a/models/model.npz b/models/model.npz new file mode 100644 index 0000000..13270fa Binary files /dev/null and b/models/model.npz differ diff --git a/src/gor-predict.py b/src/gor-predict.py new file mode 100644 index 0000000..d844f37 --- /dev/null +++ b/src/gor-predict.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +from __future__ import print_function +from utils import parse_pssm, parse_fasta, seq_to_profile + +import sys + +if __name__ == "__main__": + + try: + import argparse + import numpy as np + except ImportError as e: + print('This program requires Python 2.7+ and numpy', file=sys.stderr) + raise ImportError(e) + + parser = argparse.ArgumentParser(description='Use the GOR method to predict protein secondary structure from a PSSM or from a FASTA file.') + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--pssm', type=str, help='File containing the PSSM profile.') + group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.') + parser.add_argument('filename_model', type=str, help='File containing the GOR model.') + args = parser.parse_args() + + model = np.load(args.filename_model)['model'].item() + + if args.pssm: + profile = parse_pssm(args.pssm) + else: + profile = seq_to_profile(parse_fasta(args.fasta)) + + dssp = '' + + for i in range(0, len(profile)): + half_window_size = int((len(model['-'])-1)/2) + score_H, score_E, score_C = 0, 0, 0 + for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))): + for k in range(0, len(profile[i])): + score_H += profile[j][k] * model['H'][j-i+half_window_size][k] + score_E += profile[j][k] * model['E'][j-i+half_window_size][k] + score_C += profile[j][k] * model['-'][j-i+half_window_size][k] + scores = [score_H, score_E, score_C] + dssp += ['H', 'E', '-'][scores.index(max(scores))] + + print(dssp) diff --git a/src/gor-train.py b/src/gor-train.py new file mode 100644 index 0000000..b8092df --- /dev/null +++ b/src/gor-train.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +from __future__ import print_function +from utils import parse_pssm, parse_dssp + +import sys + +if __name__ == "__main__": + + try: + import argparse + import numpy as np + except ImportError as e: + print('This program requires Python 2.7+ and numpy', file=sys.stderr) + raise ImportError(e) + + parser = argparse.ArgumentParser(description='Train the GOR method on PSSM and DSSP files.') + parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.') + parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.') + parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.') + parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".') + parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.') + args = parser.parse_args() + + with open(args.filename_id_list) as id_list: + + model = {} + model['H'] = np.zeros((args.window_size,20),dtype=float) + model['E'] = np.zeros((args.window_size,20),dtype=float) + model['-'] = np.zeros((args.window_size,20),dtype=float) + + for line in id_list: + line = line.rstrip() + profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm')) + if np.sum(profile) != 0: + ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp') + ss = ss.replace('C','-') + for i in range(0, len(profile)): + half_window_size = int((args.window_size-1)/2) + for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))): + for k in range(0, len(profile[i])): + matrix = model[ss[i]] + matrix[j-i+half_window_size][k] += float(profile[j][k]) + model[ss[i]] = matrix + + model['H'] = np.divide(model['H'], np.sum(model['H'], axis=1).reshape(17,1)) + model['E'] = np.divide(model['E'], np.sum(model['E'], axis=1).reshape(17,1)) + model['-'] = np.divide(model['-'], np.sum(model['-'], axis=1).reshape(17,1)) + np.savez(args.filename_model+'.npz', model=model) diff --git a/src/svm-predict.py b/src/svm-predict.py new file mode 100644 index 0000000..6cbb8b1 --- /dev/null +++ b/src/svm-predict.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +from __future__ import print_function +from utils import parse_pssm, parse_fasta, seq_to_profile + +import sys + +if __name__ == "__main__": + + try: + import argparse + import pickle + import numpy as np + from sklearn import svm + except ImportError as e: + print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr) + raise ImportError(e) + + parser = argparse.ArgumentParser(description='Use the SVM method to predict protein secondary structure from a PSSM or from a FASTA file.') + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--pssm', type=str, help='File containing the PSSM profile.') + group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.') + parser.add_argument('--probs', action='store_true', help='Output class probabilities together with the prediction.') + parser.add_argument('filename_model', type=str, help='File containing the model.') + args = parser.parse_args() + + model = pickle.load(open(args.filename_model, 'rb')) + + if args.pssm: + profile = np.array(parse_pssm(args.pssm)) + else: + profile = np.array(seq_to_profile(parse_fasta(args.fasta))) + + dssp = '' + + for i in range(0, len(profile)): + half_window_size = int((len(model.support_vectors_[0])/20-1)/2) + part1 = np.zeros(20*max(0, half_window_size-i)) + part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))]) + part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1))) + vec = np.concatenate((part1, part2, part3)) + if args.probs: + probs = model.predict_proba(vec.reshape(1, -1))[0] + dssp += ' '.join([str(round(prob, 3)).rjust(5) for prob in probs]) + dssp += ' ' + ['H', 'E', '-'][np.argmax(probs)] + '\n' + else: + dssp += ['H', 'E', '-'][model.predict(vec.reshape(1, -1))[0]] + + if args.probs: + dssp = dssp[:-1] + print(dssp) diff --git a/src/svm-train.py b/src/svm-train.py new file mode 100644 index 0000000..887a921 --- /dev/null +++ b/src/svm-train.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +from __future__ import print_function +from utils import parse_pssm, parse_dssp + +import sys + +if __name__ == "__main__": + + try: + import argparse + import pickle + import numpy as np + from sklearn import svm + except ImportError as e: + print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr) + raise ImportError(e) + + parser = argparse.ArgumentParser(description='Train the SVM method on PSSM and DSSP files.') + parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.') + parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.') + parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.') + parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".') + parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.') + args = parser.parse_args() + + with open(args.filename_id_list) as id_list: + + X, y = [], [] + + ss_map = {'H': 0, 'E': 1, 'C': 2, '-': 2} + + for line in id_list: + line = line.rstrip() + profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm')) + if np.sum(profile) != 0: + ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp') + for i in range(0, len(profile)): + half_window_size = int((args.window_size-1)/2) + part1 = np.zeros(20*max(0, half_window_size-i)) + part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))]) + part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1))) + vec = np.concatenate((part1, part2, part3)) + X.append(vec.tolist()) + y.append(ss_map[ss[i]]) + + X, y = np.array(X), np.array(y) + + model = svm.SVC(kernel='rbf',C=1,gamma=0.125,probability=True) + model.fit(X, y) + pickle.dump(model, open(args.filename_model+'.sav', 'wb')) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..5235ca1 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,39 @@ +#!/usr/bin/python + +def parse_pssm(pssm_filename): + profile = [] + with open(pssm_filename) as pssm: + pssm_lines = pssm.readlines() + for line in pssm_lines[3:-6]: + profile_line = [] + for n in line.rstrip().split()[22:-2]: + profile_line.append(float(n)/100) + profile.append(profile_line) + return profile + +def parse_dssp(dssp_filename): + ss = '' + with open(dssp_filename) as dssp: + dssp.readline() + ss = dssp.readline().rstrip() + return ss + +def parse_fasta(fasta_filename): + seq = '' + with open(fasta_filename) as fasta: + fasta.readline() + seq = fasta.readline().rstrip() + return seq + +def seq_to_profile(seq): + profile = [] + aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] + for res in seq: + profile_line = [] + for aa in aa_list: + if res == aa: + profile_line.append(1) + else: + profile_line.append(0) + profile.append(profile_line) + return profile