dbgap_eutils_compare_merge.py
'''
TODO: create stat file(s) with quality metrics (a hedged sketch of these checks is appended at the end of this file):
1. which SRR#s are different between dbGaP and eutils
2. which column headers contain duplicate data
3. which column headers are unique to, or missing from, either the eutils or dbGaP source
'''
import pandas as pd
import numpy as np
### Input file paths
dbgap_file_path = '/Users/jdorsheimer/Projects/BDCat/dbGaP-23523/SRA_phs000571/merge_compare/dbGaP_SRARunTable.csv'
eutils_file_path = '/Users/jdorsheimer/Projects/BDCat/dbGaP-23523/SRA_phs000571/merge_compare/eutils_SRARunTable.csv'
### Extract header values
dbgap_header = list(pd.read_csv(dbgap_file_path, nrows=0))
eutils_header = list(pd.read_csv(eutils_file_path, nrows=0))
### Create data frames from the csv files and sort alphanumerically by the first column (SRR#s)
dbgap_data = pd.read_csv(dbgap_file_path, header=0, names=dbgap_header).sort_values(by=dbgap_header[0])
eutils_data = pd.read_csv(eutils_file_path, header=0, names=eutils_header).sort_values(by=eutils_header[0])
### Drop columns that are entirely empty or all zeros
dbgap_data = dbgap_data.replace(0, np.nan).dropna(axis=1, how='all')
eutils_data = eutils_data.replace(0, np.nan).dropna(axis=1, how='all')
### Extract first column (SRR#s) from data frames
dbgap_run_list = dbgap_data[dbgap_header[0]].tolist()
eutils_run_list = eutils_data[eutils_header[0]].tolist()
### Select a sample SRR# that will be used to compare data between dbGaP and eutils
### Note: any SRR# would work here; the first one was chosen for simplicity
sample_id = dbgap_data.iloc[0, 0]
### Extract the row corresponding to 'sample_id' from both data frames as flat lists of values
dbgap_unique_values = dbgap_data.loc[dbgap_data[dbgap_header[0]] == sample_id].values.tolist()[0]
eutils_unique_values = eutils_data.loc[eutils_data[eutils_header[0]] == sample_id].values.tolist()[0]
### Create dictionary for pairing keys and values (data and headers)
temp_dict = {}
### Reselect headers based on removed empty and zeros columns
dbgap_header = list(dbgap_data)
eutils_header = list(eutils_data)
### Create combined lists for headers and data for the above selected SRR#
temp_comb_headers = dbgap_header+eutils_header
temp_comb_unique_values = dbgap_unique_values+eutils_unique_values
### Update dictionary with key:value pairs of data:header
### This format was chosen because unique data values will only be selected once,
### whereas headers could be selected multiple times given the nature of the data
for header, value in zip(temp_comb_headers, temp_comb_unique_values):
    if value not in temp_dict:
        temp_dict[value] = header
### Reverse dictionary such that the key:value pairs are now set as headers:data
true_form_dict = {v: k for k, v in temp_dict.items()}
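### Note: because duplicate values were skipped above, a header whose sample value already
### appeared under an earlier header is dropped here; dbGaP headers win ties since they
### come first in temp_comb_headers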
### Extract the de-duplicated headers from true_form_dict into a list
stripped_headers = list(true_form_dict.keys())
### Create new dictionary to hold data for specified column headers
stripped_dict = {}
### Add full column data for each retained header; when a header exists in both sources, the dbGaP column is used
for header in stripped_headers:
    if header in dbgap_header:
        stripped_dict[header] = list(dbgap_data[header])
    elif header in eutils_header:
        stripped_dict[header] = list(eutils_data[header])
### Convert dictionary back into a data frame
combined_data = pd.DataFrame(stripped_dict)
### Export the combined data frame to csv
combined_data.to_csv(r'/Users/jdorsheimer/Projects/BDCat/dbGaP-23523/SRA_phs000571/merge_compare/combined_SRARunTable.csv', index=False, header=True)
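### --- Hedged sketch for the TODO quality-metric stats above ---
### This is a minimal, illustrative sketch, not part of the original workflow: it assumes
### set-based comparisons are sufficient, approximates "duplicate data" columns as the
### headers dropped during de-duplication, and writes to a placeholder filename
### ('compare_stats.txt') in the same directory as the other files
srr_only_in_dbgap = sorted(set(dbgap_run_list) - set(eutils_run_list))
srr_only_in_eutils = sorted(set(eutils_run_list) - set(dbgap_run_list))
### Headers that never contributed a unique sample value (i.e. their data duplicated another column)
duplicate_headers = sorted(set(temp_comb_headers) - set(stripped_headers))
### Headers present in only one of the two sources
headers_only_in_dbgap = sorted(set(dbgap_header) - set(eutils_header))
headers_only_in_eutils = sorted(set(eutils_header) - set(dbgap_header))
with open(r'/Users/jdorsheimer/Projects/BDCat/dbGaP-23523/SRA_phs000571/merge_compare/compare_stats.txt', 'w') as stats_file:
    stats_file.write('SRR#s only in dbGaP: {}\n'.format(srr_only_in_dbgap))
    stats_file.write('SRR#s only in eutils: {}\n'.format(srr_only_in_eutils))
    stats_file.write('Column headers with duplicate data: {}\n'.format(duplicate_headers))
    stats_file.write('Column headers only in dbGaP: {}\n'.format(headers_only_in_dbgap))
    stats_file.write('Column headers only in eutils: {}\n'.format(headers_only_in_eutils))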