-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathOutputAnalysis_v2.py
2169 lines (1789 loc) · 83.5 KB
/
OutputAnalysis_v2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
Module for analysis/comparison/etc of hits files.
#author: DP
#date: 8/14/2018
Updated 06/01/20 to analyze output from Annotatorv3
"""
import pandas as pd
import tkinter
from tkinter import filedialog
import pickle
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
from PyQt5 import QtWidgets
from terminalFragmentor_Main import FragmentSite
from terminalFragmentor_Main import print_hits
#Not directly used by this module, but needed by FragmentSite
from terminalFragmentor_Main import ThyIon
from Parameter_Parser_terminal import load_hits_file
from terminalFragmentor_Main import save_fragments
import re
import pygubu
CONFIG_FILE = 'config.txt' # config file for saving last directory for fancy filedialog
def compare_main():
    """
    Prompt the user for a CNTRL and a TEMPO .hits file, then run
    comparing_files() to keep only the TEMPO ions absent from the CNTRL.
    The result is saved in a user-chosen folder as '<TEMPO>-<CNTRL>-avg.hits'.
    :return: void
    """
    file1 = filedialog.askopenfilename(title='Load CNTRL file', filetypes=[('Hits', '.hits')])
    file2 = filedialog.askopenfilename(title='Load TEMPO file', filetypes=[('Hits', '.hits')])
    # Transform paths into base sample names.
    # BUG FIX: str.strip('-avg.hits') removes a *character set* ('-','a','v',
    # 'g','.','h','i','t','s'), which also eats matching leading/trailing
    # letters of the sample name. Remove the exact suffix instead.
    file1_outname = os.path.basename(file1)
    if file1_outname.endswith('-avg.hits'):
        file1_outname = file1_outname[:-len('-avg.hits')]
    file2_outname = os.path.basename(file2)
    if file2_outname.endswith('-avg.hits'):
        file2_outname = file2_outname[:-len('-avg.hits')]
    # Create output filename
    output_title = f"{file2_outname}-{file1_outname}"
    # Choose output directory
    outdir = filedialog.askdirectory(title='Choose Output Folder')
    # Run comparison
    comparing_files(file1, file2, outdir, output_title=output_title)
def sitelist_to_sitedict(file):
    """
    Load a .hits file and index its hits by fragment site.
    :param file: path to a .hits file
    :return: (dict mapping each site to the list of its hits,
              protein sequence string)
    """
    sitelist = load_hits_file(file)
    # One entry per site; a fresh list copy of that site's hits
    site_dict = {site: list(site.hits) for site in sitelist}
    protein_seq = get_protein_seq(sitelist)
    return site_dict, protein_seq
def comparing_files(file1, file2, outdir, output_title=None):
    """
    Check which ions from a TEMPO file are NOT in the CNTRL (which has been
    searched with the TEMPO search space) and save only those ions.
    :param file1: CNTRL .hits file path
    :param file2: TEMPO .hits file path
    :param outdir: Directory to save the output files
    :param output_title: name of output file (without extension)
    :return: void
    """
    final_list = []
    print('Comparing Files...')
    dict1, protein_seq1 = sitelist_to_sitedict(file1)
    dict2, protein_seq2 = sitelist_to_sitedict(file2)
    dict3 = {}
    # BUG FIX: the original only printed a warning on sequence mismatch and
    # then fell through to `dict3[site]`, crashing with a KeyError on the
    # empty dict. Bail out early instead.
    if protein_seq1 != protein_seq2:
        print("Files must be from the same sequence!")
        return
    for site in dict1:
        print(site)
        # m/z values present in the CNTRL at this site -> to be removed
        thyion_ls = [hit1.thy_ion.mz_mono for hit1 in dict1[site]]
        print(f"To remove = {thyion_ls}")
        to_keep = []
        for hit2 in dict2[site]:
            print(hit2.thy_ion.mz_mono)
            if hit2.thy_ion.mz_mono not in thyion_ls:
                print(hit2.thy_ion.mz_mono)
                to_keep.append(hit2)
        print(f"To keep = {to_keep}")
        dict3[site] = to_keep
    print(dict3)
    for site in dict2:
        site.hits = dict3[site]
        final_list.append(site)
    output_name = os.path.join(outdir, output_title + '-avg.hits')
    # BUG FIX: strip('.hits') strips a character set and can eat trailing
    # letters of the title; replace the exact extension instead
    csv_outname = output_name[:-len('.hits')] + '.csv'
    print_hits(protein_seq1, final_list, csv_outname)
    print(output_name)
    # `with` closes the file; the explicit close() was redundant
    with open(output_name, 'wb') as picklefile:
        pickle.dump(final_list, picklefile)
def merging_files(files, outdir, output_title=None):
    """
    "Merge" files that correspond to different passes of the same analysis
    (same sample). Usually for complex searches (e.g. albumin with disulfide
    bonds). Hits for the same site are concatenated across all files.
    :param files: list of .hits file paths
    :param outdir: Directory to save the output files
    :param output_title: name of output file (without extension)
    :return: void
    """
    final_list = []
    counter = 0
    hit_counter = 0
    mergefiles_dict = {}
    protein_seq = []
    for index, hits_file in enumerate(files):
        print('analyzing file {} of {}'.format(index + 1, len(files)))
        sitelist = load_hits_file(hits_file)
        print(f"Length of sitelist: {len(sitelist)}")
        protein_seq.append(get_protein_seq(sitelist))
        for site in sitelist:
            thyions = list(site.hits)
            hit_counter += len(thyions)
            # First file initializes the dictionary; later files extend it
            if counter == 0:
                mergefiles_dict[site] = thyions
            else:
                mergefiles_dict[site].extend(thyions)
        counter += 1
        print(counter)
    print(f"before: {mergefiles_dict}")
    for site in mergefiles_dict:
        # Push the merged hit lists back onto the site objects
        site.hits = list(mergefiles_dict[site])
        final_list.append(site)
    print(final_list)
    print(f"hit counter = {hit_counter}")
    output_name = os.path.join(outdir, output_title + '-merged.hits')
    # BUG FIX: strip('.hits') strips a character set and can mangle the
    # name; replace the exact extension instead
    csv_outname = output_name[:-len('.hits')] + '.csv'
    print_hits(protein_seq[-1], final_list, csv_outname)
    print(output_name)
    # `with` closes the file; the explicit close() was redundant
    with open(output_name, 'wb') as picklefile:
        pickle.dump(final_list, picklefile)
def averaging_files(files, outdir, output_title=None):
    """
    "Average" replicate files: keep only the ions that appear at a given
    site in every replicate file.
    :param files: list of .hits file paths
    :param outdir: Directory to save the output files
    :param output_title: name of output file (without extension)
    :return: void
    """
    final_list = []
    counter = 0
    avgfiles_dict = {}
    protein_seq = []
    for index, hits_file in enumerate(files):
        print('analyzing file {} of {}'.format(index + 1, len(files)))
        sitelist = load_hits_file(hits_file)
        protein_seq.append(get_protein_seq(sitelist))
        for site in sitelist:
            thyions = list(site.hits)
            # First file initializes the dictionary; later files extend it
            if counter == 0:
                avgfiles_dict[site] = thyions
            else:
                avgfiles_dict[site].extend(thyions)
        counter += 1
        print(counter)
    print(f"before: {avgfiles_dict}")
    for site in avgfiles_dict:
        # Count how many times an ion occurs at this site after aggregating
        # all replicate files; ions must be present in all replicates
        # (count >= number of files) to survive.
        unique_ions = set(avgfiles_dict[site])
        ions_remove = [ion for ion in unique_ions
                       if avgfiles_dict[site].count(ion) < counter]
        avgfiles_dict[site] = unique_ions.difference(set(ions_remove))
    print(f"after: {avgfiles_dict}")
    for site in avgfiles_dict:
        site.hits = list(avgfiles_dict[site])
        final_list.append(site)
    print(final_list)
    output_name = os.path.join(outdir, output_title + '-avg.hits')
    # BUG FIX: strip('.hits') strips a character set and can mangle the
    # name; replace the exact extension instead
    csv_outname = output_name[:-len('.hits')] + '.csv'
    print_hits(protein_seq[-1], final_list, csv_outname)
    print(output_name)
    # `with` closes the file; the explicit close() was redundant
    with open(output_name, 'wb') as picklefile:
        pickle.dump(final_list, picklefile)
def merging_batch():
    """
    Batch version of merging_files. Reads a .csv batch file in which a line
    starting with '#' begins a new file set, and each following line holds
    'indir,filename,outdir,output_title'. outdir/output_title are taken from
    the first line of a set; later lines may leave the first column empty to
    reuse the previous indir.
    :return: void
    """
    batch_file = filedialog.askopenfilename(title='Merge_Batch Hits Files', filetypes=[('CSV File', '.csv')])
    files_list = []
    indir = ''
    with open(batch_file, 'r') as batch:
        for line in batch:
            if line.startswith("#"):
                # '#' marks the start of a new file set
                files_sub = []
                files_list.append(files_sub)
                continue
            line = line.strip("\n")
            splits = line.split(',')
            if splits[0]:
                indir = splits[0]
                outdir = splits[2]
                output_title = splits[3]
                files_sub.append(outdir)
                files_sub.append(output_title)
                # BUG FIX: os.path.join instead of a hard-coded backslash so
                # batch files also work on non-Windows systems
                files_sub.append(os.path.join(indir, splits[1]))
            else:
                files_sub.append(os.path.join(indir, splits[1]))
            print(files_sub)
    print(indir)
    print(files_list)
    for file_set in files_list:
        # skip empty sets (e.g. a trailing '#' line) — mirrors average_main
        if file_set:
            outdir = file_set[0]
            output_title = file_set[1]
            files = file_set[2:]
            merging_files(files, outdir, output_title)
def average_main(batch=False):
    """
    Entry point for averaging replicate .hits files.
    :param batch: Boolean; if True a .csv batch file describes the input
        files, output folders and output names (same line format as
        merging_batch). If False the user picks the .hits files and output
        folder through file dialogs.
    :return: void
    """
    if batch:
        batch_file = filedialog.askopenfilename(title='Batch Hits Files', filetypes=[('CSV File', '.csv')])
        files_list = []
        indir = ''
        # renamed handle so it no longer shadows the `batch` parameter
        with open(batch_file, 'r') as batch_fh:
            for line in batch_fh:
                if line.startswith("#"):
                    # '#' marks the start of a new file set
                    files_sub = []
                    files_list.append(files_sub)
                    continue
                line = line.strip("\n")
                splits = line.split(',')
                if splits[0]:
                    indir = splits[0]
                    outdir = splits[2]
                    output_title = splits[3]
                    files_sub.append(outdir)
                    files_sub.append(output_title)
                    # BUG FIX: portable path construction (was a hard-coded
                    # backslash that breaks on non-Windows systems)
                    files_sub.append(os.path.join(indir, splits[1]))
                else:
                    files_sub.append(os.path.join(indir, splits[1]))
                print(files_sub)
        print(indir)
        print(files_list)
        for file_set in files_list:
            if file_set:
                print(f"file_set = {file_set}")
                outdir = file_set[0]
                output_title = file_set[1]
                files = file_set[2:]
                averaging_files(files, outdir, output_title)
    else:
        hitsfiles = filedialog.askopenfilenames(title='Load Hits Files', filetypes=[('Hits', '.hits')])
        outdir = filedialog.askdirectory(title='Choose Output Folder')
        # BUG FIX: rstrip('.hits') strips a character set and can eat the
        # end of the sample name; remove the exact extension instead
        output_title = os.path.basename(hitsfiles[0])
        if output_title.endswith('.hits'):
            output_title = output_title[:-len('.hits')]
        averaging_files(hitsfiles, outdir, output_title)
def main_ecdciu_plots(files, iontype_exclusion=None):
    """
    Plot stacked n/c sequence-coverage bars for each .hits file, one subplot
    row per file with shared axes, saved as "testing.png" in a user-chosen
    folder.
    :param files: (list of strings) full system paths to .hits files to analyze
    :param iontype_exclusion: unused; kept for interface compatibility
    :return: void
    """
    # The user chooses the output directory (using the input dir was confusing)
    outputdir = filedialog.askdirectory(title='Choose Output Folder')
    os.chdir(outputdir)
    # BUG FIX: the original created a throwaway plt.figure(dpi=300) and then
    # a second figure via subplots; pass dpi to subplots directly
    fig, ax = plt.subplots(nrows=len(files), ncols=1, sharex=True, sharey=True, dpi=300)
    # BUG FIX: with a single file plt.subplots returns a bare Axes object;
    # normalize to an array so the ax[i] indexing below works in both cases
    ax = np.atleast_1d(ax)
    subplot = 0
    for index, hits_file in enumerate(files):
        print('analyzing file {} of {}'.format(index + 1, len(files)))
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: remove the exact '.hits' extension (rstrip strips a
        # character set and can eat the end of the name)
        output_filename = os.path.basename(hits_file)
        if output_filename.endswith('.hits'):
            output_filename = output_filename[:-len('.hits')]
        # Sequence coverage percentage and covered site indices
        seqcovnum, Nindxs, Cindxs, totalsitesnum = seq_cov_number(protein_seq, sitelist)
        # Normalized per-site intensities for n- and c-terminal ions
        n_ints, c_ints, max_int = compute_seq_cov(protein_seq, sitelist, norm=True, cz_iontypes_only=False)
        datalists = [n_ints, c_ints]
        labels = ['n', 'c']
        indices = np.arange(0, len(datalists[0]))
        for i, data in enumerate(datalists):
            if i == 0:
                ax[subplot].bar(indices, data, label=labels[i])
            else:
                # sum all previously plotted lists for the stacked-bar bottoms
                summedlist = [sum(x) for x in zip(*datalists[0:i])]
                ax[subplot].bar(indices, data, label=labels[i], bottom=summedlist)
        ax[len(files) - 1].set_xlabel("Sequence Position")
        ax[len(files) - 1].set_ylabel("Norm. Int")
        ax[subplot].set_title(output_filename)
        plt.legend(loc='best')
        subplot += 1
    plt.savefig("testing.png")
    plt.close()
def main_seq_cov(files, outputdir, extension='.png', iontype_exclusion=False, errors=False, outfile=False, outfile_extension=None, cz_iontypes_only=False):
    """
    Run sequence coverage analysis on the provided list of .hits files.
    :param files: (list of strings) full system paths to .hits files to analyze
    :param outputdir: directory for plot/.csv output
    :param extension: output plot extension (e.g. '.png')
    :param iontype_exclusion: unused; kept for interface compatibility
    :param errors: if True, also run and plot the ppm error analysis
    :param outfile: if True (and outfile_extension given), write a summary file
    :param outfile_extension: extension for the summary file
    :param cz_iontypes_only: restrict intensity sums to c/z ion types
    :return: void
    """
    output_header = 'Sample name, seq_cov, max_int, N_Sites, C_Sites\n'
    output_str = f"{output_header}"
    for index, hits_file in enumerate(files):
        print('analyzing file {} of {}'.format(index + 1, len(files)))
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: rstrip('.hits') strips a character set and can eat
        # trailing letters of the sample name; remove the exact extension
        output_filename = os.path.basename(hits_file)
        if output_filename.endswith('.hits'):
            output_filename = output_filename[:-len('.hits')]
        csv_filepath = os.path.join(outputdir, output_filename + '_analysis.csv')
        # Sequence coverage percentage plus covered site indices
        seqcovnum, Nindxs, Cindxs, totalsitesnum = seq_cov_number(protein_seq, sitelist)
        # Compute sequence coverage and plot/write to file
        n_ints, c_ints, max_int = compute_seq_cov(protein_seq, sitelist, norm=True, cz_iontypes_only=cz_iontypes_only)
        # Site index lists formatted for the .csv
        n_sites = ';'.join(map(str, Nindxs))
        c_sites = ';'.join(map(str, Cindxs))
        output_str += f"{output_filename},{seqcovnum}, {round(max_int)}, {n_sites}, {c_sites}, {totalsitesnum}\n"
        # Plot sequence position vs normalized intensity of covered sites
        general_seq_plot([n_ints, c_ints], ['n', 'c'], outputdir, plotitle=output_filename, seqcovnumber=seqcovnum, xtitle='Sequence position',
                         ytitle='Normed Int', extension=extension)
        # .csv file with the info behind the plot above
        general_seq_write([n_ints, c_ints], ['n', 'c'], seq=protein_seq, title='Sequence coverage',
                          new_filename=csv_filepath)
        seq_plot_cterm([c_ints], ['c'], outputdir, plotitle=output_filename)
        seq_plot_nterm([n_ints], ['n'], outputdir, plotitle=output_filename)
        if errors:
            error_list, error_lists_all = seq_error_analysis(protein_seq, sitelist)
            # NOTE(review): a ZeroDivisionError has been reported in here
            error_scatter_plot(error_list, outputdir, plotitle=output_filename, xtitle="Sequence Position", ytitle="Avg. Error(ppm)")
    if outfile and outfile_extension is not None:
        # context manager guarantees the summary file is closed on error
        with open("Sequence_cov" + outfile_extension, 'w') as output:
            output.write(output_str)
def seq_cov_overlay(list_hits_files, norm_bool):
    """
    Generate an overlay plot given a set of .hits files. Recommended to use
    fewer than ~10 hits files to keep things from getting too crowded.
    :param list_hits_files: list of strings (full system paths to hits files)
    :param norm_bool: whether to normalize the data or not
    :return: void
    """
    outputdir = os.path.dirname(list_hits_files[0])
    plt.clf()
    plt.figure(dpi=600)
    ax1 = plt.subplot(2, 1, 1)
    ax2 = plt.subplot(212, sharex=ax1)
    for hits_file in list_hits_files:
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: remove the exact extension (rstrip strips a char set)
        output_filename = os.path.basename(hits_file)
        if output_filename.endswith('.hits'):
            output_filename = output_filename[:-len('.hits')]
        # BUG FIX: compute_seq_cov returns 3 values; the original 2-name
        # unpack raised ValueError before anything was plotted
        n_ints, c_ints, max_int = compute_seq_cov(protein_seq, sitelist, norm=norm_bool)
        # Plot sequence position vs intensity; n-term on top, c-term below
        seq_indices = np.arange(1, len(n_ints) + 1)
        ax1.plot(seq_indices, n_ints, linewidth=0.8)
        ax1.axes.xaxis.set_visible(False)
        cv_label = parse_funcs_filename(output_filename)
        ax2.plot(seq_indices, c_ints, linewidth=0.8, label=cv_label)
    # rearrange subplots to fit legend at the side
    box1 = ax1.get_position()
    ax1.set_position([box1.x0, box1.y0, box1.width * 0.95, box1.height])
    box2 = ax2.get_position()
    ax2.set_position([box2.x0, box2.y0, box2.width * 0.95, box2.height])
    plt.legend(loc='center left', bbox_to_anchor=(1, 1), fancybox=True, fontsize=8)
    plt.xlabel('Seq Position')
    plt.ylabel('Intensity')
    plotname = 'overlay.png'
    plotfilename = os.path.join(outputdir, plotname)
    plt.savefig(plotfilename)
    plt.close()
def parse_funcs_filename(filename):
    """
    Convert an IMTBX function number embedded in a filename to a collision
    voltage label (specifically for DP fragment data). The function number is
    the last three characters of the filename stem (text before the first '.').
    :param filename: string
    :return: string of format '<voltage>V'
    """
    stem = filename.split('.')[0]
    func_num = int(stem[-3:])
    # function 1 -> 10 V, each subsequent function adds 5 V
    return '{}V'.format(10 + (func_num - 1) * 5)
def main_seqcov_iontype(files, outputdir, neutloss=False):
    """
    Run sequence coverage analysis by ion type on the provided .hits files.
    :param files: (list of strings) full system paths to .hits files to analyze
    :param outputdir: Directory to save output
    :param neutloss: if True, consider neutral losses (Pyteomics calculates
        neutral losses for both CID and ExD ions)
    :return: void
    """
    for index, hits_file in enumerate(files):
        print('analyzing file {} of {}'.format(index + 1, len(files)))
        # Read the pickle file and get protein sequence
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: remove the exact '.hits' extension (rstrip strips a
        # character set and can eat the end of the name)
        output_filename = os.path.basename(hits_file)
        if output_filename.endswith('.hits'):
            output_filename = output_filename[:-len('.hits')]
        csv_filepath = os.path.join(outputdir, output_filename + '_analysis.csv')
        # BUG FIX: seq_cov_number returns (coverage, N_idxs, C_idxs, total);
        # the original stringified the whole tuple. Only the coverage
        # percentage belongs in the plot annotation.
        seqcovnum = str(seq_cov_number(protein_seq, sitelist)[0])
        # Compute sequence coverage per ion type and plot/write to file
        if neutloss:
            a_ratios, b_ratios, c_ratios, x_ratios, y_ratios, z_ratios = compute_seq_cov_iontype(
                protein_seq, sitelist)
        else:
            a_ratios, b_ratios, c_ratios, x_ratios, y_ratios, z_ratios = compute_seq_cov_iontype_noneutloss(
                protein_seq, sitelist)
        # Plot sequence position vs normalized intensity per ion type
        general_seq_plot([a_ratios, b_ratios, c_ratios, x_ratios, y_ratios, z_ratios], ['a', 'b', 'c', 'x', 'y', 'z'], outputdir, plotitle=output_filename, seqcovnumber=seqcovnum,
                         xtitle='Sequence position', ytitle='Normed Int')
        # .csv file with the info behind the plot above
        general_seq_write([a_ratios, b_ratios, c_ratios, x_ratios, y_ratios, z_ratios], ['a', 'b', 'c', 'x', 'y', 'z'], seq=protein_seq, title='Sequence coverage',
                          new_filename=csv_filepath)
def main_seq_cov_overlayn(files, outputdir):
    """
    Overlay sequence-coverage bars for the N-terminal half of the protein
    across several .hits files. Each file's bars are shifted along x so the
    overlaid series do not hide each other.
    :param files: (list of strings) full system paths to .hits files to analyze
    :param outputdir: directory for the output plot
    :return: void
    """
    plt.figure('overalyn', dpi=200)
    # Clear frame settings, just in case
    plt.clf()
    # each file gets an x-offset so the overlaying works
    counter = 0
    outputfilename = ''
    for hits_file in files:
        # BUG FIX: remove the exact '.hits' extension (rstrip strips a
        # character set and can eat the end of the name)
        outputfilename = os.path.basename(hits_file)
        if outputfilename.endswith('.hits'):
            outputfilename = outputfilename[:-len('.hits')]
        counter += .45
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: seq_cov_number returns a tuple; keep only the coverage
        # percentage (of the N-terminal half) for the legend
        seqcovnum = str(seq_cov_number(protein_seq, sitelist[:len(sitelist) // 2])[0])
        # Legend currently skips the date prefix... needs to be more general
        legend = outputfilename[13:]
        # BUG FIX: compute_seq_cov returns 3 values; the original 2-name
        # unpack raised ValueError
        n_ints, c_ints, max_int = compute_seq_cov(protein_seq, sitelist)
        intensities = [n_ints]
        indices = np.arange(0, len(intensities[0]))
        # Plot only the first (N-terminal) half of the sequence
        for data in intensities:
            plt.bar(indices[:len(indices) // 2] + counter, data[:len(data) // 2], width=0.45, align='edge', label=[legend + ' {}%'.format(seqcovnum)], alpha=0.75)
    plt.xlabel('Sequence Position')
    plt.ylabel('Relative Intensity')
    plt.title('N-term Overlay')
    plt.legend(loc='best')
    plotname = outputfilename + 'Ntermoveraly' + '.png'
    plotfilename = os.path.join(outputdir, plotname)
    plt.savefig(plotfilename)
    # plt.show() works nicely but blocks the program on close - CRR
    plt.close()
def main_seq_cov_overlayc(files, outputdir, outputfilename):
    """
    Overlay sequence-coverage bars for the C-terminal half of the protein
    across several .hits files. Each file's bars are shifted along x so the
    overlaid series do not hide each other.
    :param files: (list of strings) full system paths to .hits files to analyze
    :param outputdir: directory for the output plot
    :param outputfilename: prefix for the output plot filename
    :return: void
    """
    plt.figure('overalyn', dpi=200)
    # Clear frame settings, just in case
    plt.clf()
    # each file gets an x-offset so the overlaying works
    counter = 0
    for hits_file in files:
        counter += 0.45
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: seq_cov_number returns a tuple; keep only the coverage
        # percentage (of the C-terminal half) for the legend
        seqcovnum = str(seq_cov_number(protein_seq, sitelist[len(sitelist) // 2:])[0])
        # BUG FIX: remove the exact '.hits' extension (rstrip strips a
        # character set); legend currently skips the date prefix
        legend_str = os.path.basename(hits_file)
        if legend_str.endswith('.hits'):
            legend_str = legend_str[:-len('.hits')]
        legend = legend_str[13:]
        # BUG FIX: compute_seq_cov returns 3 values; the original 2-name
        # unpack raised ValueError
        n_ints, c_ints, max_int = compute_seq_cov(protein_seq, sitelist)
        intensities = [c_ints]
        indices = np.arange(0, len(intensities[0]))
        # Plot only the second (C-terminal) half of the sequence
        for data in intensities:
            plt.bar(indices[len(indices) // 2:] + counter, data[len(data) // 2:], width=0.45, align='edge', label=[legend + ' {}%'.format(seqcovnum)], alpha=0.75)
    plt.xlabel('Sequence Position')
    plt.ylabel('Relative Intensity')
    plt.title('C-term Overlay')
    plt.legend(loc='best')
    plotname = outputfilename + 'Ctermoveraly' + '.png'
    plotfilename = os.path.join(outputdir, plotname)
    plt.savefig(plotfilename)
    # plt.show() works nicely but blocks the program on close - CRR
    plt.close()
def main_seq_cov_combo(files, outputdir):
    """
    Run sequence coverage analysis on the combined site lists of two .hits
    files; save the coverage plot, a .csv and the merged .hits file.
    :param files: (list of strings) full system paths to .hits files to analyze
    :param outputdir: directory for the .csv/plot output
    :return: void
    """
    # Pull site lists, sequences and base names from both input files
    (sitelist1, protein_seq1, output_filename1,
     sitelist2, protein_seq2, output_filename2) = unpack_hitsfile(files)
    # Merge the two datasets; get the combined coverage percentage
    sitelist3, seqcovnum3 = combo_sitelists(sitelist1, sitelist2, protein_seq1, protein_seq2)
    output_filename = f'{output_filename1}-{output_filename2}_combo'
    csv_filepath = os.path.join(outputdir, output_filename + '-analysis.csv')
    # Per-site intensities for the merged dataset
    n_ints, c_ints, max_int = compute_seq_cov(protein_seq1, sitelist3)
    # Sequence position vs normalized intensity of covered sites
    general_seq_plot([n_ints, c_ints], ['n', 'c'], outputdir,
                     plotitle=output_filename, seqcovnumber=seqcovnum3,
                     xtitle='Sequence position', ytitle='Normed Int')
    # .csv with the data behind the plot above
    general_seq_write([n_ints, c_ints], ['n', 'c'], seq=protein_seq1,
                      title='Sequence coverage', new_filename=csv_filepath)
    # Persist the combined hits file
    save_fragments(sitelist3, output_filename + '.hits')
def combo_sitelists(site_list1, site_list2, protein_seq1, protein_seq2):
    """
    Merge site lists from two different hits files in order to combine
    sequencing datasets.
    :param site_list1: site list from hits file 1
    :param site_list2: site list from hits file 2
    :param protein_seq1: protein sequence from hits file 1
    :param protein_seq2: protein sequence from hits file 2
    :return: (merged site list, coverage percentage string)
    :raises ValueError: if the two files are not from the same protein
    """
    # Only two datasets of the same protein can be combined.
    # BUG FIX: the original only printed a warning on mismatch and then fell
    # through to the return, raising a confusing NameError on site_list3.
    if protein_seq1 != protein_seq2:
        print("Protein Sequences should be the same!")
        raise ValueError("Protein sequences must match to combine datasets")
    # Shallow copy of the first site list; sites are matched by terminal
    # and sequence index, and their hit lists concatenated
    site_list3 = site_list1[:]
    for site3 in site_list3:
        for site2 in site_list2:
            if site3.term == site2.term and site3.seq_index == site2.seq_index:
                site3.hits += site2.hits
    seqcovnum3 = str(seq_cov_number(protein_seq1, site_list3))
    return site_list3, seqcovnum3
def unpack_hitsfile(files):
    """
    Obtain the site list, protein sequence and base filename from each hits
    file provided by the user (max two).
    :param files: (list of strings) full system paths to .hits files
    :return: sitelist1, protein_seq1, output_filename1, sitelist2, protein_seq2, output_filename2
    """
    sitelist1 = protein_seq1 = output_filename1 = None
    sitelist2 = protein_seq2 = output_filename2 = None
    for file_index, hits_file in enumerate(files, start=1):
        if file_index > 2:
            # BUG FIX: the original counter was only incremented for the
            # first file, so files 3+ silently overwrote the second dataset
            # and this warning could never fire
            print("No more than two files can be combined!!!")
            break
        sitelist = load_hits_file(hits_file)
        protein_seq = get_protein_seq(sitelist)
        # BUG FIX: remove the exact '.hits' extension (rstrip strips a
        # character set and can eat the end of the name)
        output_filename = os.path.basename(hits_file)
        if output_filename.endswith('.hits'):
            output_filename = output_filename[:-len('.hits')]
        if file_index == 1:
            sitelist1, protein_seq1, output_filename1 = sitelist, protein_seq, output_filename
        else:
            sitelist2, protein_seq2, output_filename2 = sitelist, protein_seq, output_filename
    return sitelist1, protein_seq1, output_filename1, sitelist2, protein_seq2, output_filename2
def compute_seq_cov(protein_seq, site_list, norm=False, cz_iontypes_only=False):
    """
    Compute per-site summed hit intensity for a protein sequence given an experimental hit list.
    :param protein_seq: protein amino acid sequence string
    :param site_list: list of FragmentSite objects containing hit information
    :param norm: if True, intensities are divided by the maximum site intensity; if False, absolute intensities are returned
    :param cz_iontypes_only: if True, only hits whose ion type starts with 'c' or 'z' contribute intensity
    :return: (n-terminal intensities, c-terminal intensities, max intensity). The two arrays are indexed by
    site position from the n-terminus and are ready for plotting.
    """
    # NOTE(review): bad_iontypes is not referenced anywhere in this function
    bad_iontypes = ["z+2", "z+3", "z-H2O", "x-H2O", "c-H2O", "z-NH3", "c-NH3", "x-NH3"]
    cz_iontypes = ['z', 'c']
    # find max intensity for normalization
    max_int = get_max_int(site_list)
    # guard against division by zero when no site has any intensity
    if max_int == 0:
        max_int = 1e-10
    # compute normalized intensity at each site and in list by site index
    frag_sites_len = len(protein_seq) + 1 # Extra entry to include the complete sequence
    c_norm_ints = np.zeros(frag_sites_len)
    n_norm_ints = np.zeros(frag_sites_len)
    for site in site_list:
        # get total intensity at this site and normalize it against max found in any site
        total_int = 0
        hitcharge = 0
        for hit in site.hits:
            currentcharge= hit.thy_ion.charge
            if cz_iontypes_only:
                if hit.thy_ion.iontype[0] in cz_iontypes:
                    # skip a hit whose charge equals the charge of the last
                    # counted hit — counts each charge state once per run.
                    # NOTE(review): this assumes consecutive hits with the
                    # same charge are duplicates; order-dependent — confirm
                    # hits are sorted by charge upstream
                    if hitcharge == currentcharge:
                        continue
                    else:
                        total_int += float(hit.exp_ion.pkar_cluster)
            else:
                # same consecutive-charge dedup as above, for all ion types
                if hitcharge == currentcharge:
                    continue
                else:
                    total_int += float(hit.exp_ion.pkar_cluster)
            hitcharge = hit.thy_ion.charge
        if not norm:
            normed_int = total_int
        else:
            normed_int = float(total_int) / max_int
        # get terminal and correct so C- and N-terminal ions are using same numbering scheme
        site_num = site.get_seq_index(len(protein_seq))
        if site.term == 'N':
            # numbering starts at 1 and goes upwards. Map to starting at 0 and going up (subtract 1)
            n_norm_ints[site_num] = normed_int
        elif site.term == 'C':
            # length of the sequence is referenced to the opposite terminal and needs to be flipped
            c_norm_ints[site_num] = normed_int
        else:
            print('INVALID TERMINAL! Terminal was ' + site.term)
    return n_norm_ints, c_norm_ints, max_int
def compute_seq_cov_iontype_noneutloss(protein_seq, site_list, norm=False):
    """
    Compute per-site summed hit intensity broken out by ion type (a/b/c and
    x/y/z), ignoring neutral-loss variants: only exact ion-type strings
    ('a', 'b', 'c'/'c-dot', 'x', 'y', 'z'/'z-dot') are counted.
    :param protein_seq: protein amino acid sequence string
    :param site_list: list of FragmentSite objects containing hit information
    :param norm: if True, intensities are divided by the maximum site intensity
    :return: six arrays (a, b, c, x, y, z), each indexed by site position from
    the n-terminus (ready for plotting)
    """
    # find max intensity for normalization
    max_int = get_max_int(site_list)
    #Create an array for each ion type
    a_ratios = np.zeros(len(protein_seq))
    b_ratios = np.zeros(len(protein_seq))
    c_ratios = np.zeros(len(protein_seq))
    x_ratios = np.zeros(len(protein_seq))
    y_ratios = np.zeros(len(protein_seq))
    z_ratios = np.zeros(len(protein_seq))
    for site in site_list:
        site_num = site.get_seq_index(len(protein_seq))
        if site.term == 'C':
            # C-terminal sites accumulate x/y/z ion intensities
            x_int = 0
            y_int = 0
            z_int = 0
            for hit in site.hits:
                # comparing the full iontype string excludes neutral-loss
                # variants (e.g. 'z-H2O' matches none of the branches).
                # NOTE(review): sibling compute_seq_cov_iontype reads
                # hit.thy_ion.ion_type[0] instead — confirm attribute name
                if hit.thy_ion.iontype == 'x':
                    x_int += float(hit.exp_ion.pkar_cluster)
                elif hit.thy_ion.iontype == 'y':
                    y_int += float(hit.exp_ion.pkar_cluster)
                elif hit.thy_ion.iontype == 'z' or hit.thy_ion.iontype == 'z-dot':
                    z_int += float(hit.exp_ion.pkar_cluster)
                else:
                    print('INVALID ION-TYPE! Ion-type was {}'.format(hit.thy_ion.iontype))
            if not norm:
                # store absolute intensities (site_num is 1-based -> -1)
                x_ratio = x_int
                x_ratios[site_num - 1] = x_ratio
                y_ratio = y_int
                y_ratios[site_num - 1] = y_ratio
                z_ratio = z_int
                z_ratios[site_num - 1] = z_ratio
            else:
                # get total intensity at site per ion type and normalize it against max found in any site
                x_ratio = float(x_int) / max_int
                x_ratios[site_num - 1] = x_ratio
                y_ratio = float(y_int) / max_int
                y_ratios[site_num - 1] = y_ratio
                z_ratio = float(z_int) / max_int
                z_ratios[site_num - 1] = z_ratio
        elif site.term == 'N':
            # N-terminal sites accumulate a/b/c ion intensities
            a_int = 0
            b_int = 0
            c_int = 0
            # check for a and b fragments and compare cluster total areas
            for hit in site.hits:
                if hit.thy_ion.iontype == 'a':
                    a_int += float(hit.exp_ion.pkar_cluster)
                elif hit.thy_ion.iontype == 'b':
                    b_int += float(hit.exp_ion.pkar_cluster)
                elif hit.thy_ion.iontype == 'c' or hit.thy_ion.iontype == 'c-dot':
                    c_int += float(hit.exp_ion.pkar_cluster)
                else:
                    print('INVALID ION-TYPE! Ion-type was {}'.format(hit.thy_ion.iontype))
            if not norm:
                # store absolute intensities (site_num is 1-based -> -1)
                a_ratio = a_int
                a_ratios[site_num - 1] = a_ratio
                b_ratio = b_int
                b_ratios[site_num - 1] = b_ratio
                c_ratio = c_int
                c_ratios[site_num - 1] = c_ratio
            else:
                # normalize each ion type against the max site intensity
                a_ratio = float(a_int) / max_int
                a_ratios[site_num-1] = a_ratio
                b_ratio = float(b_int) / max_int
                b_ratios[site_num-1] = b_ratio
                c_ratio = float(c_int) / max_int
                c_ratios[site_num-1] = c_ratio
        else:
            print('INVALID TERMINAL! Terminal was ' + site.term)
    return a_ratios, b_ratios, c_ratios, x_ratios, y_ratios, z_ratios
def compute_seq_cov_iontype(protein_seq, site_list, norm =False):
"""
Compute and plot sequence coverage by iontype for a protein sequence given experimental hit list.
:param protein_seq: protein amino acid sequence string
:param site_list: list of FragmtSite objects containing hit information
:return: n-terminal normalized intensities, c-terminal normalized intensities. Both in lists from n-terminal (ready
for plotting).
"""
# find max intensity for normalization
max_int = get_max_int(site_list)
#Create an array for each ion type
a_ratios = np.zeros(len(protein_seq))
b_ratios = np.zeros(len(protein_seq))
c_ratios = np.zeros(len(protein_seq))
x_ratios = np.zeros(len(protein_seq))
y_ratios = np.zeros(len(protein_seq))
z_ratios = np.zeros(len(protein_seq))
for site in site_list:
site_num = site.get_seq_index(len(protein_seq))
if site.term == 'C':
x_int = 0
y_int = 0
z_int = 0
for hit in site.hits:
# to no considering neutral losses don't index at 0
if hit.thy_ion.ion_type[0] == 'x':
x_int += float(hit.exp_ion.pkar_cluster)
elif hit.thy_ion.ion_type[0] == 'y':
y_int += float(hit.exp_ion.pkar_cluster)
elif hit.thy_ion.ion_type[0] == 'z':
z_int += float(hit.exp_ion.pkar_cluster)
else:
print('INVALID ION-TYPE! Ion-type was {}'.format(hit.thy_ion.iontype))
if not norm:
x_ratio = x_int
x_ratios[site_num - 1] = x_ratio
y_ratio = y_int