+ import os
+ import sys
+ import pickle
+ import random
+ import re
+ import tqdm
+ import cv2
+ import numpy as np
+ import pandas as pd
+ import sklearn.model_selection
+ from collections import Counter
+
+ # Make the repo's Code/src folder importable before importing project modules
+ paths_to_append = [os.path.join(os.getcwd(), 'Code', 'src')]
+ def add_path_to_sys_path(path_to_append):
+     if not any(path_to_append in paths for paths in sys.path):
+         sys.path.append(path_to_append)
+
+ [add_path_to_sys_path(crt_path) for crt_path in paths_to_append]
+
+ import azure_chestxray_utils
+
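+ # Note: the list comprehension above is used purely for its side effect; a plain
+ # loop is the more idiomatic sketch:
+ #     for crt_path in paths_to_append:
+ #         add_path_to_sys_path(crt_path)
+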
+ # Shared directory for data and outputs. os.mkdir returns None, so keep the
+ # path string separate from the directory-creation call.
+ path = os.path.join(os.getcwd(), 'azure-share')
+ if not os.path.exists(path):
+     os.mkdir(path)
+ amlWBSharedDir = path
+
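+ # An equivalent, more compact sketch using pathlib (assumes Python 3.5+);
+ # exist_ok makes the explicit existence check unnecessary:
+ #     from pathlib import Path
+ #     Path(path).mkdir(parents=True, exist_ok=True)
+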
+ prj_consts = azure_chestxray_utils.chestxray_consts()
+ print(prj_consts)
+
+ data_base_input_dir = os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
+ data_base_output_dir = os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))
+
+ # Create the input/output directories if needed; do not overwrite the path
+ # variables with os.mkdir's None return value
+ if not os.path.exists(data_base_input_dir):
+     os.makedirs(data_base_input_dir)
+ print(data_base_input_dir)
+
+ if not os.path.exists(data_base_output_dir):
+     os.makedirs(data_base_output_dir)
+ print(data_base_output_dir)
+
+ nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
+                                        os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))
+ if not os.path.exists(nih_chest_xray_data_dir):
+     os.makedirs(nih_chest_xray_data_dir)
+
+ print(nih_chest_xray_data_dir)
+
+ other_data_dir = os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))
+ data_partitions_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))
+
+ ignored_images_set = set()
+
+ total_patient_number = 30805
+ NIH_annotated_file = 'BBox_List_2017.csv'  # pathology annotated by radiologists; excluded from training
+ manually_selected_bad_images_file = 'blacklist.csv'  # images that visually look bad are excluded
+
+ # Patient IDs run from 1 to total_patient_number inclusive
+ patient_id_original = [i for i in range(1, total_patient_number + 1)]
+
+ bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))
+ # 'Image Index' values look like '00000032_037.png'; characters 3:8 are the
+ # zero-padded patient ID ('00032' -> patient 32)
+ bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)
+
+ bbox_patient_index_list = []
+ for index, item in bbox_patient_index_df.items():
+     bbox_patient_index_list.append(int(item))
+
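+ # A vectorized sketch of the same extraction (equivalent result, assuming the
+ # filename format above):
+ #     bbox_patient_index_list = bbox_df['Image Index'].str.slice(3, 8).astype(int).tolist()
+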
+ patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
+ print("len of original patient id is", len(patient_id_original))
+ print("len of cleaned patient id is", len(patient_id))
+ print("len of unique patient id with annotated data",
+       len(list(set(bbox_patient_index_list))))
+ print("number of annotated bounding-box rows", bbox_df.shape[0])
+
+ random.seed(0)
+ random.shuffle(patient_id)
+
+ print("first ten patient ids are", patient_id[:10])
+
+ # Split patients into training:valid:test = 7:1:2
+ patient_id_train = patient_id[:int(total_patient_number * 0.7)]
+ patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]
+ # The remaining patients, plus all patients with radiologist-annotated bounding
+ # boxes, form the test set
+ patient_id_test = patient_id[int(total_patient_number * 0.8):]
+ patient_id_test.extend(bbox_patient_index_list)
+ patient_id_test = list(set(patient_id_test))
+
+ print("train:{} valid:{} test:{}".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))
+
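+ # Sanity-check sketch: since bbox patients were removed from patient_id before
+ # the split, the three partitions should be pairwise disjoint
+ #     assert not set(patient_id_train) & set(patient_id_test)
+ #     assert not set(patient_id_valid) & set(patient_id_test)
+ #     assert not set(patient_id_train) & set(patient_id_valid)
+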
+ pathologies_name_list = prj_consts.DISEASE_list
+ NIH_patients_and_labels_file = 'Data_Entry_2017.csv'
+
+ labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))
+
+ # Show the label distribution
+
+ # Unique ID frequencies could be computed with a list comprehension, e.g.
+ #     [[x, list(crtData['fullID2']).count(x)] for x in set(crtData['fullID2'])]
+ # but for tallying, collections.Counter is faster
+ pathology_distribution = Counter(list(labels_df['Finding Labels']))
+
+ # Sort by frequency (dict value)
+ sorted_by_freq = sorted(pathology_distribution.items(), key=lambda x: x[1], reverse=True)
+ print(len(sorted_by_freq))
+ print(sorted_by_freq[:20])
+ print(sorted_by_freq[-10:])
+
+ # Per-pathology image counts: split the '|'-separated label strings into
+ # indicator columns and sum each column
+ print(labels_df['Finding Labels'].str.split('|', expand=False).str.join(sep='*').str.get_dummies(sep='*').sum())
+
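+ # The split/join detour is not needed; a more direct sketch with the same output:
+ #     print(labels_df['Finding Labels'].str.get_dummies(sep='|').sum())
+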
+ def process_data(current_df, patient_ids):
+     """Collect image names and 14-dim binary label vectors for the given patients."""
+     image_name_index = []
+     image_labels = {}
+     for individual_patient in tqdm.tqdm(patient_ids):
+         for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():
+             processed_image_name = row['Image Index']
+             if processed_image_name in ignored_images_set:
+                 pass
+             else:
+                 image_name_index.append(processed_image_name)
+                 image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)
+                 for disease_index, ele in enumerate(pathologies_name_list):
+                     if re.search(ele, row['Finding Labels'], re.IGNORECASE):
+                         image_labels[processed_image_name][disease_index] = 1
+                     else:
+                         # redundant (the vector is zero-initialized), kept for readability
+                         image_labels[processed_image_name][disease_index] = 0
+             # print("processed", row['Image Index'])
+     return image_name_index, image_labels
+
+
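+ # Performance note: the loop above filters the full DataFrame once per patient
+ # (O(patients x rows)). A faster sketch would group once and look up each group:
+ #     groups = current_df.groupby('Patient ID')
+ #     rows = groups.get_group(individual_patient)
+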
+ train_data_index, train_labels = process_data(labels_df, patient_id_train)
+ valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)
+ test_data_index, test_labels = process_data(labels_df, patient_id_test)
+
+ print("train, valid, test image number is:", len(train_data_index), len(valid_data_index), len(test_data_index))
+
+ # Save the data
+ labels_all = {}
+ labels_all.update(train_labels)
+ labels_all.update(valid_labels)
+ labels_all.update(test_labels)
+
+ partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}
+
+ with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'wb') as f:
+     pickle.dump(labels_all, f)
+
+ with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'wb') as f:
+     pickle.dump(partition_dict, f)
+
+ # Also save the patient ID partitions for PyTorch training
+ with open(os.path.join(data_partitions_dir, 'train_test_valid_data_partitions.pickle'), 'wb') as f:
+     pickle.dump([patient_id_train, patient_id_valid,
+                  patient_id_test,
+                  list(set(bbox_patient_index_list))], f)
+
+ print(type(train_labels))
+ print({k: train_labels[k] for k in list(train_labels)[:5]})
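+ # Downstream loading sketch (assumes the same data_partitions_dir and file names):
+ #     with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'rb') as f:
+ #         partition_dict = pickle.load(f)
+ #     with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'rb') as f:
+ #         labels_all = pickle.load(f)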
0 commit comments