-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmeta_analysis_data.py
374 lines (299 loc) · 17.4 KB
/
meta_analysis_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/usr/bin/env python
import argparse as ap
import numpy as np
import pandas as pd
import glob, os, sys
from scipy import stats as sts
def read_params():
p = ap.ArgumentParser(description=\
"Utility for creating a metadata table to perform a meta-analysis. \n"
"This program has two main scopes: a) if you specify as the input_folder argument \n"
"a combinedMetadata table, it will merge the population queried starting from that \n"
"and therefore will store no-data. In this case, data will be \n"
"retrievable using another script: '[python] call_data.py' which takes \n"
"a metadata table and a type of data as input and ensures "
"you get your data. The other usage for this program is "
"\nto pass as a first argument a folder which contains metadata + data. "
"\nIn this second settings, the program will subset the load of \n"
"data i input_folder for you, returning a metadata + data table directly."
"\n\n*** Example usages *** \n\n"
"The following 2 examples return \nthe datasets for 2 analyses "
"described in the paper \n'Manghi, Schiffer, et al, 2021 in press.'\n"
""
"\nExample a):"
"\n==> returns 4207 samples used to "
"study the microbiome related differences between "
"males & females\n\n python meta_analysis_data.py "
"combinedMetadata.tsv sex_contrast_meta_analysis.tsv "
" -min age:16 \\\n --cat study_condition:control "
"body_site:stool \\\n -mp gender:25 \\\n -cf BMI \\\n -mm gender:40 "
" --verbose \n"
""
"\nExample b)"
"\n==> return 3385 samples used to "
"study microbiome relationships with ageing. "
"\n(Imagine to run this having metadata + data rel.abuns. "
"previously stored in rel_abundances/):\n "
"\n python meta_analysis_data.py rel_abundances/ "
"ageing_dataset_ma.tsv \\\n -min age:16 \\\n --cat study_condition:control "
"body_site:stool \\\n -iqr age:15 \\\n -cf BMI \\\n --debug\n\n"
"\n The flags used in these two commands"
"serve for:"
"\n -min age:16 populations will have minimum 16 year"
"\n -cat body_site:stool: populations'll have body_site==stool"
"\n -mp gender:25 least abun. class inside 'gender' will"
"cover at least 25 percent of the total"
"\n -iqr age:15: in each population, age will have an IQR of at least 15"
"\n"
"\nThere is no limit to each argument: e.g. you can select as mimum of maximum as you want.\n"
"There other are parameters which are extremely usefull:\n "
"\n--search a:b:c:d will select only samples and population having b,c or d under ciolumn a"
"\n--minmin set a len minium required for the population ('-mm 40') or a minimum for the least abundant class ('-mm gender:40')"
"\n"
"--multiple. If negative (-1, default) allows just baseline samples. If an int, "
"uses all samples inside the 'multiple' upper bound."
"\n\n"
"\n** IMPORTANT **\n"
"\n\n"
"When a flag is activated, it will also exclude the "
"\nNaN values in the column it refers to: e.g.\n "
"--min age:16 will exclude samples younger than 16 AND samples having age==NaN.\n"
"Therefore: \n"
"\n-cf/--cfd (confounder) flag simply ensure that the data are non-NaN in the column chosen.\n"
"\n You call any argument as many times as you want BUT MULTIPLE", \
formatter_class=ap.RawTextHelpFormatter)
add = p.add_argument
add("input_folder", type=str, help=\
"Folder where the tables of metadata "
"+ abundances are stored\n "
"tables inside this folder will be "
"filtered for selection \n(it can have more "
"tables than) the ones you need.\n "
"An alternative way is to specify "
"under this parameter\n "
"the combinedMetadata table, a table which "
"contains \nall the metadata in this package.\n "
"This last option is automatically "
"activated when the first argument ends in "
"'.tsv'. \n"
"Consider, now, that if you specify this option, "
" your dataset will be tailored \nas "
"well for your analysis, but it will\n "
"be constituted solely by metadata.")
add("output_dataset", type=str, help="The name of the table with the population "
"and relative abundances for your meta-analysis")
#add("-z", "--featid", type=str, help="This keyword allows to distinguish features from metadata, a brief examples:"
# "\n 's__' is for species\n'g__' is for genera\n'PWY' is for pathways\n'K' if for KEGG-profiles")
add("-min", "--min", type=str, nargs="+", default=[], help="Use ':' ==> age:16 means minimum 16 in the columns age")
add("-max", "--max", type=str, nargs="+", default=[], help="Use ':' ==> BMI:45 means maximum 45 in the columns BMI")
add("-iqr", "--iqr", type=str, nargs="+", default=[], help="Use ':' ==> age:10 means asking at least 10 as IQR for age in this population")
add("-mul", "--multiple", type=int, default=-1, help="Upper bound of days from the baseline allowed (default -1 uses only baselines, while 0 uses all)")
add("-cat", "--cat", type=str, nargs="+", default=[], help="Use ':' ==> study_condition:control means asking just controls")
add("-ex", "--exclude", type=str, nargs="+", default=[], help="Works like --cat, but for exclusion (useful mixed with --search)")
add("-search", "--search", type=str, nargs="+", default=[], help="Use ':' ==> search:study_condition:a:b:c:etc asks for a or b or c or etc in column 'study_condition'")
add("-bn", "--binary", type=str, nargs="+", default=[], help="Use ':' (-bn cls:pos:prevpos:neg) ==> under column cls, substitute everything == prevpos with pos, "
"and everything else with neg (useful mixed with ex or cat)")
add("-mp", "--min_perc", type=str, nargs="+", default=[], help="Use ':' ==> column-name:25 searches at least min 25%% in the less abundance class under column-name")
add("-mm", "--minmin", type=str, default="0", help="Can have 2 shapes: gender:40 requires a minimum of 40 individuals in the less abundance sex "
", while just 40 requires a minimum of 40 individuals in total (This keyword is very important because many dataset are quite small for a meta-analysis)")
add("-cf", "--cfd", type=str, nargs="+", default=[], help="Use words, such as: BMI age gender: this will simply exclude samples **without** these confounders")
add("-si", "--study_identifier", type=str, default="study_name", help="Name of the column identifing Dataset [default: study_name]")
add("-v", "--verbose", action="store_true", help="Tell you what is adding and how big it is")
add("--debug", action="store_true", help="Similar to verbose, but prints more stuff")
return vars(p.parse_args())
def handle_input(input_argument, studyID, verbose):
data_2_tables = {}
if input_argument.endswith(".tsv"):
if os.path.isfile(input_argument):
metmet = pd.read_csv(input_argument, sep="\t", header=0, index_col=0, low_memory=False, engine="c").fillna("NA")
for study in metmet[studyID].unique().tolist():
data_2_tables[study] = metmet.loc[metmet[studyID]==study, :].T
else:
raise FileNotFoundError("Sorry: you input a combinedMetadata.tsv table string which is not present in cur dir. Exiting")
else:
if not os.path.isdir( input_argument ):
raise FileNotFoundError("The directory you set as the input folder is not there. Exiting.")
elif os.path.isdir( input_argument ) and (not len(os.listdir( input_argument ))):
raise IndexError( "The directory you set up as a input folder exists, but is empty. Exiting.")
else:
for table in glob.glob(os.path.join(input_argument, "*")):
data_2_tables[os.path.basename(table.replace(".tsv", ""))] = pd.read_csv(table, sep="\t", header=0, index_col=0, low_memory=False, engine="c").fillna("NA")
return data_2_tables
def select(args):
madata = []
import numpy
print(args)
metadata_dict = handle_input(args["input_folder"], args["study_identifier"], args["verbose"])
for name in metadata_dict:
tabtab = metadata_dict[name] #pd.read_csv(table, sep="\t", header=0, index_col=0, low_memory=False, engine="c").fillna("NA")
tabtab.index.name = "sample_id"
tabtab.columns = [ (dt + "_" + c) for c,dt in zip(tabtab.columns.tolist(), tabtab.loc[args["study_identifier"]].tolist()) ]
study = tabtab.loc[args["study_identifier"]].tolist()[0]
print(study)
if args["min"] and (not isinstance(tabtab, np.ndarray)):
for ag in args["min"]:
ag = ag.split(":")
col = ag[0]
min_ = int(ag[1])
if col in tabtab.index.tolist():
tabtab = tabtab.loc[ :, tabtab.loc[col]!="NA" ]
tabtab = tabtab.loc[ :, tabtab.loc[col].astype(float).astype(int)>=min_ ]
if args["debug"]:
sys.stdout.write("\nMin ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = np.array([])
if args["max"] and (not isinstance(tabtab, np.ndarray)):
for ag in args["max"]:
ag = ag.split(":")
col = ag[0]
max_ = int(ag[1])
if col in tabtab.index.tolist():
tabtab = tabtab.loc[ :, tabtab.loc[col]!="NA" ]
tabtab = tabtab.loc[ :, tabtab.loc[col]<=max_ ]
if args["debug"]:
sys.stdout.write("\nMAX ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = np.array([])
if args["cat"] and (not isinstance(tabtab, np.ndarray)):
for ct in args["cat"]:
ct = ct.split(":")
col = ct[0]
ctgs = ct[1:]
if col in tabtab.index.tolist():
tabtab = tabtab.loc[ :, tabtab.loc[col]!="NA" ]
tabtab = tabtab.loc[ :, tabtab.loc[col].isin(ctgs) ]
if args["debug"]:
sys.stdout.write("\nCATG ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = np.array([])
if args["exclude"] and (not isinstance(tabtab, np.ndarray)):
for ag in args["exclude"]:
ag = ag.split(":")
col = ag[0]
exc = ag[1]
if col in tabtab.index.tolist():
tabtab = tabtab.loc[ :, ~tabtab.loc[col].isin([exc])]
if args["cfd"] and (not isinstance(tabtab, np.ndarray)):
for confounder in args["cfd"]:
if (not isinstance(tabtab, np.ndarray)) and (confounder in tabtab.index.tolist()):
tabtab = tabtab.loc[ :, tabtab.loc[confounder]!="NA" ]
else:
tabtab = np.array([])
if args["search"] and (not isinstance(tabtab, np.ndarray)):
for ag in args["search"]:
col = ag.split(":")[0]
arrgs = ag.split(":")[1:]
if col in tabtab.index.tolist():
#if (not "NA" in arrgs):
tabtab = tabtab.loc[ :, tabtab.loc[col]!="NA" ]
tabtab = tabtab.loc[ :, tabtab.loc[col].isin(arrgs) ]
if args["debug"]:
sys.stdout.write("\nSEarch ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = np.array([])
if args["multiple"] and (not isinstance(tabtab, np.ndarray)):
if ("days_from_first_collection" in tabtab.index.tolist()) and \
(len(tabtab.loc["days_from_first_collection"].unique()) >1):
if args["multiple"] < 0.:
tabtab.loc["days_from_first_collection"] = [ (0.0 if str(n)=="NA" else float(n)) \
for n in tabtab.loc["days_from_first_collection"].tolist() ]
tabtab = tabtab.loc[ :, tabtab.loc["days_from_first_collection"] == 0.0 ]
tabtab = tabtab.T.drop_duplicates(["subject_id"], keep="first").T
if args["debug"]:
sys.stdout.write("\nDROP ==> " + str(tabtab.shape) + "[" + study + "]")
sys.stdout.write("\nMULT ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = tabtab.loc[ :, tabtab.loc["days_from_first_collection"] <= args["multiple"] ]
else:
if args["multiple"] < 0.:
tabtab = tabtab.T.drop_duplicates(["subject_id"], keep="first").T
if args["debug"]:
#exit(1)
sys.stdout.write("\nDROP ==> " + str(tabtab.shape) + "[" + study + "]")
#else:
# tabtab = np.array([])
if not tabtab.shape[1]:
tabtab = np.array([])
else:
if args["debug"]:
sys.stdout.write("\n!! NO-DROP ==> " + str(tabtab.shape) + "[" + study + "]")
if (not isinstance(tabtab, np.ndarray)) and (not tabtab.shape[1]):
tabtab = np.array([])
if args["binary"] and (not isinstance(tabtab, np.ndarray)):
for bn in args["binary"]:
cls, pos, prevpos, neg = bn.split(":")
if cls in tabtab.index.tolist():
#if neg != "NA":
# tabtab = tabtab.loc[ :, tabtab.loc[cls]!="NA" ]
tabtab.loc[ cls ] = [( pos if (nm==prevpos) else neg ) for nm in tabtab.loc[ cls ].tolist() ]
else:
tabtab = np.array([])
if args["min_perc"] and (not isinstance(tabtab, np.ndarray)):
for mp in args["min_perc"]:
col, perc = mp.split(":")
perc = float(perc)
if col in tabtab.index.tolist():
tabtab = tabtab.loc[ :, tabtab.loc[col]!="NA" ]
if tabtab.shape[1]:
tabtab = tabtab \
if \
((np.min([((tabtab.loc[:, tabtab.loc[col]==z].shape[1]/float(tabtab.shape[1]))*100) for z in tabtab.loc[col].unique()])>=perc) \
and (len(tabtab.loc[col].unique())>1)) \
else \
np.array([])
else:
tabtab = np.array([])
if args["debug"]:
sys.stdout.write("\nMIN PERC ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = np.array([])
if args["iqr"] and (not isinstance(tabtab, np.ndarray)):
for iqr in args["iqr"]:
iqr = iqr.split(":")
col = iqr[0]
num = float(iqr[1])
if (not isinstance(tabtab, np.ndarray)) and (col in tabtab.index.tolist()):
tabtab = tabtab.loc[ :, tabtab.loc[col]!="NA" ]
tabtab = tabtab if (sts.iqr(tabtab.loc[ col ].values.astype(float)) >= num) else np.array([])
if args["debug"]:
sys.stdout.write("\nIQR ==> " + str(tabtab.shape) + "[" + study + "]")
else:
tabtab = np.array([])
if args["minmin"] and (not isinstance(tabtab, np.ndarray)):
if len(args["minmin"].split(":")) == 2:
minmin = args["minmin"].split(":")
col = minmin[0]
mm = int(minmin[1])
if tabtab.shape[1]:
#### print(study, [ tabtab.loc[col, tabtab.loc[col]==u].shape[0] for u in tabtab.loc[col].unique() ] )
tabtab = tabtab \
if (((np.min([ tabtab.loc[col, tabtab.loc[col]==u].shape[0] for u in tabtab.loc[col].unique()]) >= mm)) \
and ( len(tabtab.loc[col].unique())>1 )) \
else np.array([])
else:
tabtab = np.array([])
if args["debug"]:
sys.stdout.write("\nMIN-MIN ==> " + str(tabtab.shape) + "[" + study + "]")
else:
mm = int(args["minmin"])
tabtab = tabtab if (tabtab.shape[1]>=mm) else np.array([])
if len(tabtab):
if args["verbose"] or args["debug"]:
sys.stdout.write("\nMerging population %s with %i samples - " %(tabtab.loc[args["study_identifier"]].tolist()[0], tabtab.shape[1]))
if (not len(madata)):
madata = tabtab
else:
madata = madata.merge( tabtab, left_index=True, right_index=True, how="outer" )
if not isinstance(madata, list):
madata.fillna(0.0, inplace=True)
if len(madata):
if args["verbose"] or args["debug"]:
sys.stdout.write("\nYour dataset %s has %i samples! Does this make sense?\n" %(args["output_dataset"], madata.shape[1]))
madata.to_csv( args["output_dataset"], sep="\t", header=True, index=True )
else:
raise IndexError("We are sorry: apparently your search is too detailed for this package, and we can't provide any sample with the desired characteristics Exiting")
def main():
args = read_params()
select(args)
if __name__=="__main__":
main()