Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
katarinaelez authored Jan 24, 2019
1 parent bd5d3bf commit 4799689
Show file tree
Hide file tree
Showing 6 changed files with 234 additions and 0 deletions.
Binary file added models/model.npz
Binary file not shown.
44 changes: 44 additions & 0 deletions src/gor-predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env python

from __future__ import print_function
from utils import parse_pssm, parse_fasta, seq_to_profile

import sys

if __name__ == "__main__":

try:
import argparse
import numpy as np
except ImportError as e:
print('This program requires Python 2.7+ and numpy', file=sys.stderr)
raise ImportError(e)

parser = argparse.ArgumentParser(description='Use the GOR method to predict protein secondary structure from a PSSM or from a FASTA file.')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--pssm', type=str, help='File containing the PSSM profile.')
group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.')
parser.add_argument('filename_model', type=str, help='File containing the GOR model.')
args = parser.parse_args()

model = np.load(args.filename_model)['model'].item()

if args.pssm:
profile = parse_pssm(args.pssm)
else:
profile = seq_to_profile(parse_fasta(args.fasta))

dssp = ''

for i in range(0, len(profile)):
half_window_size = int((len(model['-'])-1)/2)
score_H, score_E, score_C = 0, 0, 0
for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))):
for k in range(0, len(profile[i])):
score_H += profile[j][k] * model['H'][j-i+half_window_size][k]
score_E += profile[j][k] * model['E'][j-i+half_window_size][k]
score_C += profile[j][k] * model['-'][j-i+half_window_size][k]
scores = [score_H, score_E, score_C]
dssp += ['H', 'E', '-'][scores.index(max(scores))]

print(dssp)
49 changes: 49 additions & 0 deletions src/gor-train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env python

from __future__ import print_function
from utils import parse_pssm, parse_dssp

import sys

if __name__ == "__main__":

try:
import argparse
import numpy as np
except ImportError as e:
print('This program requires Python 2.7+ and numpy', file=sys.stderr)
raise ImportError(e)

parser = argparse.ArgumentParser(description='Train the GOR method on PSSM and DSSP files.')
parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.')
parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.')
parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.')
parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".')
parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.')
args = parser.parse_args()

with open(args.filename_id_list) as id_list:

model = {}
model['H'] = np.zeros((args.window_size,20),dtype=float)
model['E'] = np.zeros((args.window_size,20),dtype=float)
model['-'] = np.zeros((args.window_size,20),dtype=float)

for line in id_list:
line = line.rstrip()
profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm'))
if np.sum(profile) != 0:
ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp')
ss = ss.replace('C','-')
for i in range(0, len(profile)):
half_window_size = int((args.window_size-1)/2)
for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))):
for k in range(0, len(profile[i])):
matrix = model[ss[i]]
matrix[j-i+half_window_size][k] += float(profile[j][k])
model[ss[i]] = matrix

model['H'] = np.divide(model['H'], np.sum(model['H'], axis=1).reshape(17,1))
model['E'] = np.divide(model['E'], np.sum(model['E'], axis=1).reshape(17,1))
model['-'] = np.divide(model['-'], np.sum(model['-'], axis=1).reshape(17,1))
np.savez(args.filename_model+'.npz', model=model)
51 changes: 51 additions & 0 deletions src/svm-predict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

from __future__ import print_function
from utils import parse_pssm, parse_fasta, seq_to_profile

import sys

if __name__ == "__main__":

try:
import argparse
import pickle
import numpy as np
from sklearn import svm
except ImportError as e:
print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr)
raise ImportError(e)

parser = argparse.ArgumentParser(description='Use the SVM method to predict protein secondary structure from a PSSM or from a FASTA file.')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--pssm', type=str, help='File containing the PSSM profile.')
group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.')
parser.add_argument('--probs', action='store_true', help='Output class probabilities together with the prediction.')
parser.add_argument('filename_model', type=str, help='File containing the model.')
args = parser.parse_args()

model = pickle.load(open(args.filename_model, 'rb'))

if args.pssm:
profile = np.array(parse_pssm(args.pssm))
else:
profile = np.array(seq_to_profile(parse_fasta(args.fasta)))

dssp = ''

for i in range(0, len(profile)):
half_window_size = int((len(model.support_vectors_[0])/20-1)/2)
part1 = np.zeros(20*max(0, half_window_size-i))
part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))])
part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1)))
vec = np.concatenate((part1, part2, part3))
if args.probs:
probs = model.predict_proba(vec.reshape(1, -1))[0]
dssp += ' '.join([str(round(prob, 3)).rjust(5) for prob in probs])
dssp += ' ' + ['H', 'E', '-'][np.argmax(probs)] + '\n'
else:
dssp += ['H', 'E', '-'][model.predict(vec.reshape(1, -1))[0]]

if args.probs:
dssp = dssp[:-1]
print(dssp)
51 changes: 51 additions & 0 deletions src/svm-train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

from __future__ import print_function
from utils import parse_pssm, parse_dssp

import sys

if __name__ == "__main__":

try:
import argparse
import pickle
import numpy as np
from sklearn import svm
except ImportError as e:
print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr)
raise ImportError(e)

parser = argparse.ArgumentParser(description='Train the SVM method on PSSM and DSSP files.')
parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.')
parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.')
parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.')
parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".')
parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.')
args = parser.parse_args()

with open(args.filename_id_list) as id_list:

X, y = [], []

ss_map = {'H': 0, 'E': 1, 'C': 2, '-': 2}

for line in id_list:
line = line.rstrip()
profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm'))
if np.sum(profile) != 0:
ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp')
for i in range(0, len(profile)):
half_window_size = int((args.window_size-1)/2)
part1 = np.zeros(20*max(0, half_window_size-i))
part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))])
part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1)))
vec = np.concatenate((part1, part2, part3))
X.append(vec.tolist())
y.append(ss_map[ss[i]])

X, y = np.array(X), np.array(y)

model = svm.SVC(kernel='rbf',C=1,gamma=0.125,probability=True)
model.fit(X, y)
pickle.dump(model, open(args.filename_model+'.sav', 'wb'))
39 changes: 39 additions & 0 deletions src/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/python

def parse_pssm(pssm_filename):
profile = []
with open(pssm_filename) as pssm:
pssm_lines = pssm.readlines()
for line in pssm_lines[3:-6]:
profile_line = []
for n in line.rstrip().split()[22:-2]:
profile_line.append(float(n)/100)
profile.append(profile_line)
return profile

def parse_dssp(dssp_filename):
ss = ''
with open(dssp_filename) as dssp:
dssp.readline()
ss = dssp.readline().rstrip()
return ss

def parse_fasta(fasta_filename):
seq = ''
with open(fasta_filename) as fasta:
fasta.readline()
seq = fasta.readline().rstrip()
return seq

def seq_to_profile(seq):
profile = []
aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
for res in seq:
profile_line = []
for aa in aa_list:
if res == aa:
profile_line.append(1)
else:
profile_line.append(0)
profile.append(profile_line)
return profile

0 comments on commit 4799689

Please sign in to comment.