-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata analysis.py
152 lines (104 loc) · 4.66 KB
/
data analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import csv
import statistics
import cobra
import pandas
import sklearn.linear_model
from matplotlib import pyplot as plt
from sklearn.model_selection import LeaveOneOut
# --- Input data ------------------------------------------------------------
# COBRA model parsed from the SBML file in the working directory.
model = cobra.io.read_sbml_model("iJO1366.xml")

# Expression table; its 'genes' column becomes the row index.
expression_values = pandas.read_csv(
    "expression values.csv",
    index_col='genes',
)

# Flux table; the code further down reads its "Rnx-Flux relationship" column.
flux = pandas.read_csv(
    "Rnx-Flux relationship modified for MLR for each reaction.csv"
)

# NOTE(review): the two tables below are read from absolute, machine-specific
# paths, so the script only runs where those paths exist. `data` is not
# referenced again in the visible part of this file.
data = pandas.read_csv(
    "D:/semesters/11/3/list of genes added-Rnx-Flux relationship modified for MLR for each reaction .csv"
)
rxns_f_list = pandas.read_csv(
    "D:/semesters/11/first data ref 17/reactions.csv"
)
# Collect the identifier of every gene that appears in the gene-reaction rule
# of each entry of `rxns_f_list`, drop duplicates and write the result to
# "genes needed.csv".
list_of_systematic_names_of_needed_genes = []
# NOTE(review): iterating a DataFrame yields its column labels, not its rows,
# so this loop treats the header of reactions.csv as the reaction ids --
# confirm that this is the intended file layout.
for item in rxns_f_list:
    g_r_rule = model.reactions.get_by_id(item).gene_reaction_rule
    # Tokenise the rule instead of splitting on ' or ' / ' and '. Turning
    # every parenthesis into whitespace and discarding the boolean operators
    # also handles nested rules and parentheses written without inner spaces
    # (the split-based version left fragments such as "b0001 ) )" behind).
    # An empty rule produces no tokens and therefore contributes nothing.
    rule_tokens = g_r_rule.replace("(", " ").replace(")", " ").split()
    list_of_systematic_names_of_needed_genes.extend(
        token for token in rule_tokens if token.lower() not in ("and", "or")
    )
# Original author's note: length= 126
# Remove repeated genes while keeping the order of first appearance.
list_of_systematic_names_of_needed_genes = list(
    dict.fromkeys(list_of_systematic_names_of_needed_genes)
)
# A plain list has no `to_csv` method (the original call raised
# AttributeError); wrap the list in a Series to write one gene per line under
# a "genes" header.
pandas.Series(
    list_of_systematic_names_of_needed_genes, name="genes", dtype=object
).to_csv("genes needed.csv", index=False)
# For every row of `flux`, build the list of reaction ids named in its
# "Rnx-Flux relationship" expression and the list of genes found in those
# reactions' gene-reaction rules, then attach both lists to the table as the
# columns "list of reactions" and "list of genes".
column_of_reactions = []
column_of_genes = []
# One-pass translation: '-' and ',' become '+', while parentheses and spaces
# are deleted. Every separator is thereby handled at once, so an expression
# mixing '+', '-' and ',' is split completely (the original if/elif chain
# split on the first separator kind it found and left the rest joined).
_rxn_expression_table = str.maketrans("-,", "++", "() ")
for rxns in flux["Rnx-Flux relationship"]:
    # Empty pieces (e.g. produced by a leading '-') are not reaction ids and
    # would make `get_by_id` fail, so they are skipped.
    list_of_rxns = [
        rxn_id
        for rxn_id in rxns.translate(_rxn_expression_table).split("+")
        if rxn_id
    ]
    column_of_reactions.append(list_of_rxns)
    genes = []
    for rxn in list_of_rxns:
        gene_rule = model.reactions.get_by_id(rxn).gene_reaction_rule
        # Tokenise the rule: parentheses become whitespace and the boolean
        # operators are dropped. Rules that combine 'or' and 'and' therefore
        # yield clean gene ids instead of fragments like "( b1 and b2 )", and
        # an empty rule adds no gene (previously it added the empty string).
        rule_tokens = gene_rule.replace("(", " ").replace(")", " ").split()
        genes.extend(
            token for token in rule_tokens
            if token.lower() not in ("and", "or")
        )
    # De-duplicate while keeping first-seen order, so the gene order (and the
    # resulting feature-column order) is the same on every run.
    genes = list(dict.fromkeys(genes))
    column_of_genes.append(genes)
# `flux_and_genes` is the same object as `flux` (no copy is made), so the new
# columns and the new index are visible through both names.
flux_and_genes = flux
flux_and_genes["list of reactions"] = column_of_reactions
flux_and_genes["list of genes"] = column_of_genes
# Optional export of the augmented table (left disabled, as in the original):
#flux_and_genes.to_csv("list of genes added-Rnx-Flux relationship modified for MLR for each reaction .csv")
flux_and_genes.set_index("Flux Module Name (short)", inplace=True)
# Column labels selected from both `flux_and_genes` and `expression_values`.
conditions = [
    'Acetate', 'Fructose', 'Galactose', 'Glucose',
    'Glycerol', 'Gluconate', 'Pyruvate', 'Succinate',
]

# For each index entry of `flux_and_genes`, run a leave-one-out cross
# validation of a linear regression (expression of the row's genes -> the
# row's values under `conditions`) and store the correlation between the
# held-out values and their predictions.
pearson_for_reactions = {}
for reaction in flux_and_genes.index:
    # Target: the row's values for the selected conditions.
    Y = flux_and_genes.loc[reaction, conditions]
    # Features: the listed genes' expression, transposed so that each row is
    # one condition and each column one gene.
    # NOTE(review): a gene id missing from `expression_values` makes this
    # lookup raise -- confirm every listed gene is present in the table.
    gene_ids = flux_and_genes.loc[reaction, "list of genes"]
    X = expression_values.loc[gene_ids, conditions].T

    loocv = LeaveOneOut()
    held_out_values = []
    held_out_predictions = []
    for train_index, test_index in loocv.split(X):
        # A fresh model per fold, fitted on everything except the held-out row.
        MLR_model = sklearn.linear_model.LinearRegression()
        MLR_model.fit(X.iloc[train_index], Y.iloc[train_index])
        predicted = MLR_model.predict(X.iloc[test_index])
        held_out_values.append(Y.iloc[test_index].iloc[0])
        held_out_predictions.append(predicted)

    # Both frames have a single column labelled 0; `corrwith` pairs the
    # columns by label, and the first (only) entry of its result is stored.
    held_out_values_df = pandas.DataFrame(held_out_values)
    held_out_predictions_df = pandas.DataFrame(held_out_predictions)
    pearson = held_out_values_df.corrwith(held_out_predictions_df, axis=0)
    pearson_for_reactions[reaction] = pearson.iloc[0]

# One row per index entry of `flux_and_genes`, with a single "pearson" column.
pearson_for_reactions_df = pandas.DataFrame(
    pearson_for_reactions, index=["pearson"]
).T