-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
bd5d3bf
commit 4799689
Showing
6 changed files
with
234 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function | ||
from utils import parse_pssm, parse_fasta, seq_to_profile | ||
|
||
import sys | ||
|
||
if __name__ == "__main__": | ||
|
||
try: | ||
import argparse | ||
import numpy as np | ||
except ImportError as e: | ||
print('This program requires Python 2.7+ and numpy', file=sys.stderr) | ||
raise ImportError(e) | ||
|
||
parser = argparse.ArgumentParser(description='Use the GOR method to predict protein secondary structure from a PSSM or from a FASTA file.') | ||
group = parser.add_mutually_exclusive_group(required=True) | ||
group.add_argument('--pssm', type=str, help='File containing the PSSM profile.') | ||
group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.') | ||
parser.add_argument('filename_model', type=str, help='File containing the GOR model.') | ||
args = parser.parse_args() | ||
|
||
model = np.load(args.filename_model)['model'].item() | ||
|
||
if args.pssm: | ||
profile = parse_pssm(args.pssm) | ||
else: | ||
profile = seq_to_profile(parse_fasta(args.fasta)) | ||
|
||
dssp = '' | ||
|
||
for i in range(0, len(profile)): | ||
half_window_size = int((len(model['-'])-1)/2) | ||
score_H, score_E, score_C = 0, 0, 0 | ||
for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))): | ||
for k in range(0, len(profile[i])): | ||
score_H += profile[j][k] * model['H'][j-i+half_window_size][k] | ||
score_E += profile[j][k] * model['E'][j-i+half_window_size][k] | ||
score_C += profile[j][k] * model['-'][j-i+half_window_size][k] | ||
scores = [score_H, score_E, score_C] | ||
dssp += ['H', 'E', '-'][scores.index(max(scores))] | ||
|
||
print(dssp) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function | ||
from utils import parse_pssm, parse_dssp | ||
|
||
import sys | ||
|
||
if __name__ == "__main__": | ||
|
||
try: | ||
import argparse | ||
import numpy as np | ||
except ImportError as e: | ||
print('This program requires Python 2.7+ and numpy', file=sys.stderr) | ||
raise ImportError(e) | ||
|
||
parser = argparse.ArgumentParser(description='Train the GOR method on PSSM and DSSP files.') | ||
parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.') | ||
parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.') | ||
parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.') | ||
parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".') | ||
parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.') | ||
args = parser.parse_args() | ||
|
||
with open(args.filename_id_list) as id_list: | ||
|
||
model = {} | ||
model['H'] = np.zeros((args.window_size,20),dtype=float) | ||
model['E'] = np.zeros((args.window_size,20),dtype=float) | ||
model['-'] = np.zeros((args.window_size,20),dtype=float) | ||
|
||
for line in id_list: | ||
line = line.rstrip() | ||
profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm')) | ||
if np.sum(profile) != 0: | ||
ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp') | ||
ss = ss.replace('C','-') | ||
for i in range(0, len(profile)): | ||
half_window_size = int((args.window_size-1)/2) | ||
for j in range(max(0,i-half_window_size), min(i+half_window_size+1,len(profile))): | ||
for k in range(0, len(profile[i])): | ||
matrix = model[ss[i]] | ||
matrix[j-i+half_window_size][k] += float(profile[j][k]) | ||
model[ss[i]] = matrix | ||
|
||
model['H'] = np.divide(model['H'], np.sum(model['H'], axis=1).reshape(17,1)) | ||
model['E'] = np.divide(model['E'], np.sum(model['E'], axis=1).reshape(17,1)) | ||
model['-'] = np.divide(model['-'], np.sum(model['-'], axis=1).reshape(17,1)) | ||
np.savez(args.filename_model+'.npz', model=model) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function | ||
from utils import parse_pssm, parse_fasta, seq_to_profile | ||
|
||
import sys | ||
|
||
if __name__ == "__main__": | ||
|
||
try: | ||
import argparse | ||
import pickle | ||
import numpy as np | ||
from sklearn import svm | ||
except ImportError as e: | ||
print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr) | ||
raise ImportError(e) | ||
|
||
parser = argparse.ArgumentParser(description='Use the SVM method to predict protein secondary structure from a PSSM or from a FASTA file.') | ||
group = parser.add_mutually_exclusive_group(required=True) | ||
group.add_argument('--pssm', type=str, help='File containing the PSSM profile.') | ||
group.add_argument('--fasta', type=str, help='File containing the FASTA sequence.') | ||
parser.add_argument('--probs', action='store_true', help='Output class probabilities together with the prediction.') | ||
parser.add_argument('filename_model', type=str, help='File containing the model.') | ||
args = parser.parse_args() | ||
|
||
model = pickle.load(open(args.filename_model, 'rb')) | ||
|
||
if args.pssm: | ||
profile = np.array(parse_pssm(args.pssm)) | ||
else: | ||
profile = np.array(seq_to_profile(parse_fasta(args.fasta))) | ||
|
||
dssp = '' | ||
|
||
for i in range(0, len(profile)): | ||
half_window_size = int((len(model.support_vectors_[0])/20-1)/2) | ||
part1 = np.zeros(20*max(0, half_window_size-i)) | ||
part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))]) | ||
part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1))) | ||
vec = np.concatenate((part1, part2, part3)) | ||
if args.probs: | ||
probs = model.predict_proba(vec.reshape(1, -1))[0] | ||
dssp += ' '.join([str(round(prob, 3)).rjust(5) for prob in probs]) | ||
dssp += ' ' + ['H', 'E', '-'][np.argmax(probs)] + '\n' | ||
else: | ||
dssp += ['H', 'E', '-'][model.predict(vec.reshape(1, -1))[0]] | ||
|
||
if args.probs: | ||
dssp = dssp[:-1] | ||
print(dssp) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python | ||
|
||
from __future__ import print_function | ||
from utils import parse_pssm, parse_dssp | ||
|
||
import sys | ||
|
||
if __name__ == "__main__": | ||
|
||
try: | ||
import argparse | ||
import pickle | ||
import numpy as np | ||
from sklearn import svm | ||
except ImportError as e: | ||
print('This program requires Python 2.7+, numpy and scikit-learn', file=sys.stderr) | ||
raise ImportError(e) | ||
|
||
parser = argparse.ArgumentParser(description='Train the SVM method on PSSM and DSSP files.') | ||
parser.add_argument('filename_id_list', type=str, help='File containing a list of ids.') | ||
parser.add_argument('dir_pssm', type=str, help='Directory containing the PSSM files.') | ||
parser.add_argument('dir_dssp', type=str, help='Directory containing the DSSP files.') | ||
parser.add_argument('--filename_model', type=str, default='model', help='Name for the model file. Default is "model".') | ||
parser.add_argument('--window_size', type=int, default=17, help='Size of the window to be considered when building the model. Default is 17.') | ||
args = parser.parse_args() | ||
|
||
with open(args.filename_id_list) as id_list: | ||
|
||
X, y = [], [] | ||
|
||
ss_map = {'H': 0, 'E': 1, 'C': 2, '-': 2} | ||
|
||
for line in id_list: | ||
line = line.rstrip() | ||
profile = np.array(parse_pssm(args.dir_pssm+'/'+line+'.pssm')) | ||
if np.sum(profile) != 0: | ||
ss = parse_dssp(args.dir_dssp+'/'+line+'.dssp') | ||
for i in range(0, len(profile)): | ||
half_window_size = int((args.window_size-1)/2) | ||
part1 = np.zeros(20*max(0, half_window_size-i)) | ||
part2 = np.ndarray.flatten(profile[max(0,i-half_window_size):min(i+half_window_size+1,len(profile))]) | ||
part3 = np.zeros(20*max(0, half_window_size-(len(profile)-i-1))) | ||
vec = np.concatenate((part1, part2, part3)) | ||
X.append(vec.tolist()) | ||
y.append(ss_map[ss[i]]) | ||
|
||
X, y = np.array(X), np.array(y) | ||
|
||
model = svm.SVC(kernel='rbf',C=1,gamma=0.125,probability=True) | ||
model.fit(X, y) | ||
pickle.dump(model, open(args.filename_model+'.sav', 'wb')) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/python | ||
|
||
def parse_pssm(pssm_filename): | ||
profile = [] | ||
with open(pssm_filename) as pssm: | ||
pssm_lines = pssm.readlines() | ||
for line in pssm_lines[3:-6]: | ||
profile_line = [] | ||
for n in line.rstrip().split()[22:-2]: | ||
profile_line.append(float(n)/100) | ||
profile.append(profile_line) | ||
return profile | ||
|
||
def parse_dssp(dssp_filename): | ||
ss = '' | ||
with open(dssp_filename) as dssp: | ||
dssp.readline() | ||
ss = dssp.readline().rstrip() | ||
return ss | ||
|
||
def parse_fasta(fasta_filename): | ||
seq = '' | ||
with open(fasta_filename) as fasta: | ||
fasta.readline() | ||
seq = fasta.readline().rstrip() | ||
return seq | ||
|
||
def seq_to_profile(seq): | ||
profile = [] | ||
aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'] | ||
for res in seq: | ||
profile_line = [] | ||
for aa in aa_list: | ||
if res == aa: | ||
profile_line.append(1) | ||
else: | ||
profile_line.append(0) | ||
profile.append(profile_line) | ||
return profile |