
Commit f19498b

author: yht
commit message: test-pr
1 parent 0fe9f6d commit f19498b

31 files changed (+2267 −0 lines)
Lines changed: 164 additions & 0 deletions
@@ -0,0 +1,164 @@
import sys, os
import azure_chestxray_utils
import pickle
import random
import re
import tqdm
import cv2
import numpy as np
import pandas as pd
import sklearn.model_selection
from collections import Counter

# make the project's Code/src folder importable
paths_to_append = [os.path.join(os.getcwd(), os.path.join(*(['Code', 'src'])))]
def add_path_to_sys_path(path_to_append):
    if not any(path_to_append in paths for paths in sys.path):
        sys.path.append(path_to_append)

[add_path_to_sys_path(crt_path) for crt_path in paths_to_append]

path = os.getcwd() + r'\azure-share'
# the original assigned amlWBSharedDir = os.mkdir(path), but os.mkdir() returns
# None; create the directory if needed and keep the path string either way
if not os.path.exists(path):
    os.mkdir(path)
amlWBSharedDir = path

prj_consts = azure_chestxray_utils.chestxray_consts()
print(prj_consts)

data_base_input_dir = os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir = os.path.join(amlWBSharedDir, os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))

# as above, os.mkdir() returns None (and fails on missing parents), so keep the
# path strings and use os.makedirs() with exist_ok instead
os.makedirs(data_base_input_dir, exist_ok=True)
print(data_base_input_dir)

os.makedirs(data_base_output_dir, exist_ok=True)
print(data_base_output_dir)

nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
                                       os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))
os.makedirs(nih_chest_xray_data_dir, exist_ok=True)
print(nih_chest_xray_data_dir)

other_data_dir = os.path.join(data_base_input_dir, os.path.join(*(prj_consts.ChestXray_OTHER_DATA_DIR_list)))
data_partitions_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))

ignored_images_set = set()

total_patient_number = 30805
NIH_annotated_file = 'BBox_List_2017.csv'  # exclude from training: pathology annotated by radiologists
manually_selected_bad_images_file = 'blacklist.csv'  # exclude what visually looks like bad images
# NOTE: blacklist.csv is referenced but never loaded here, so ignored_images_set stays empty

patient_id_original = [i for i in range(1, total_patient_number + 1)]

bbox_df = pd.read_csv(os.path.join(other_data_dir, NIH_annotated_file))
# extract the patient ID digits from the image file name, e.g. '00000032_001.png' -> '00032'
bbox_patient_index_df = bbox_df['Image Index'].str.slice(3, 8)

bbox_patient_index_list = []
for index, item in bbox_patient_index_df.items():  # .iteritems() was removed in pandas >= 2.0
    bbox_patient_index_list.append(int(item))

patient_id = list(set(patient_id_original) - set(bbox_patient_index_list))
print("len of original patient id is", len(patient_id_original))
print("len of cleaned patient id is", len(patient_id))
print("len of unique patient id with annotated data",
      len(list(set(bbox_patient_index_list))))
print("len of patient id with annotated data", bbox_df.shape[0])

random.seed(0)
random.shuffle(patient_id)

print("first ten patient ids are", patient_id[:10])

# training:valid:test = 7:1:2, split on patient IDs (not images) to avoid patient overlap
patient_id_train = patient_id[:int(total_patient_number * 0.7)]
patient_id_valid = patient_id[int(total_patient_number * 0.7):int(total_patient_number * 0.8)]
# the rest of the patient IDs form the test set, plus all annotated patients
patient_id_test = patient_id[int(total_patient_number * 0.8):]
patient_id_test.extend(bbox_patient_index_list)
patient_id_test = list(set(patient_id_test))

print("train:{} valid:{} test:{}".format(len(patient_id_train), len(patient_id_valid), len(patient_id_test)))

pathologies_name_list = prj_consts.DISEASE_list
NIH_patients_and_labels_file = 'Data_Entry_2017.csv'

labels_df = pd.read_csv(os.path.join(other_data_dir, NIH_patients_and_labels_file))

# show the label distribution
# unique ID frequencies can be computed with a list comprehension or the collections lib:
#   [[x, (list(crtData['fullID2'])).count(x)] for x in set(crtData['fullID2'])]
# for tallying, collections.Counter is faster than a list comprehension
pathology_distribution = Counter(list(labels_df['Finding Labels']))

# sort by frequency (dict value)
sorted_by_freq = sorted(pathology_distribution.items(), key=lambda x: x[1], reverse=True)
print(len(sorted_by_freq))
print(sorted_by_freq[:20])
print(sorted_by_freq[-10:])

# per-pathology counts: split the '|'-separated labels into dummy columns and sum
print(labels_df['Finding Labels'].str.split('|', expand=False).str.join(sep='*').str.get_dummies(sep='*').sum())
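
# Aside (illustrative, not part of the original commit): the split/join
# round-trip above can be collapsed, since str.get_dummies() splits on its
# separator directly; this prints the same per-pathology counts:
print(labels_df['Finding Labels'].str.get_dummies(sep='|').sum())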

def process_data(current_df, patient_ids):
    image_name_index = []
    image_labels = {}
    for individual_patient in tqdm.tqdm(patient_ids):
        for _, row in current_df[current_df['Patient ID'] == individual_patient].iterrows():
            processed_image_name = row['Image Index']
            if processed_image_name in ignored_images_set:
                continue
            image_name_index.append(processed_image_name)
            image_labels[processed_image_name] = np.zeros(14, dtype=np.uint8)
            for disease_index, ele in enumerate(pathologies_name_list):
                if re.search(ele, row['Finding Labels'], re.IGNORECASE):
                    image_labels[processed_image_name][disease_index] = 1
                else:
                    # redundant (np.zeros already set it) but kept for readability
                    image_labels[processed_image_name][disease_index] = 0
            # print("processed", row['Image Index'])
    return image_name_index, image_labels

train_data_index, train_labels = process_data(labels_df, patient_id_train)
valid_data_index, valid_labels = process_data(labels_df, patient_id_valid)
test_data_index, test_labels = process_data(labels_df, patient_id_test)

print("train, valid, test image number is:", len(train_data_index), len(valid_data_index), len(test_data_index))

# save the data
labels_all = {}
labels_all.update(train_labels)
labels_all.update(valid_labels)
labels_all.update(test_labels)

partition_dict = {'train': train_data_index, 'test': test_data_index, 'valid': valid_data_index}

with open(os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(labels_all, f)

with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'wb') as f:
    pickle.dump(partition_dict, f)

# also save the patient id partitions for pytorch training
with open(os.path.join(data_partitions_dir, 'train_test_valid_data_partitions.pickle'), 'wb') as f:
    pickle.dump([patient_id_train, patient_id_valid,
                 patient_id_test,
                 list(set(bbox_patient_index_list))], f)

print(type(train_labels))
print({k: train_labels[k] for k in list(train_labels)[:5]})
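
# Aside (illustrative, not part of the original commit): the pickles written
# above can be reloaded like this; the partition dict maps split names to image
# file names, and labels_all maps each image file name to a 14-element uint8 vector.
with open(os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle'), 'rb') as f:
    partition_check = pickle.load(f)
print({split: len(ids) for split, ids in partition_check.items()})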
Lines changed: 157 additions & 0 deletions
@@ -0,0 +1,157 @@
import sys, os
import pickle
import cv2
import numpy as np
import pandas as pd
from keras.models import load_model
from keras.utils import Sequence
from sklearn import metrics
from tensorflow.python.client import device_lib
import keras_contrib
import azure_chestxray_utils
import azure_chestxray_keras_utils
# (the original imported os and load_model twice; deduplicated here)

path = os.getcwd() + r'\azure-share'
amlWBSharedDir = path

prj_consts = azure_chestxray_utils.chestxray_consts()
data_base_input_dir = os.path.join(amlWBSharedDir,
                                   os.path.join(*(prj_consts.BASE_INPUT_DIR_list)))
data_base_output_dir = os.path.join(amlWBSharedDir,
                                    os.path.join(*(prj_consts.BASE_OUTPUT_DIR_list)))
weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.MODEL_WEIGHTS_DIR_list)))
fully_trained_weights_dir = os.path.join(data_base_output_dir, os.path.join(*(prj_consts.FULLY_PRETRAINED_MODEL_DIR_list)))

nih_chest_xray_data_dir = os.path.join(data_base_input_dir,
                                       os.path.join(*(prj_consts.ChestXray_IMAGES_DIR_list)))

data_partitions_dir = os.path.join(data_base_output_dir,
                                   os.path.join(*(prj_consts.DATA_PARTITIONS_DIR_list)))

label_path = os.path.join(data_partitions_dir, 'labels14_unormalized_cleaned.pickle')
partition_path = os.path.join(data_partitions_dir, 'partition14_unormalized_cleaned.pickle')

model_file_name = 'azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5'
model = load_model(os.path.join(fully_trained_weights_dir, model_file_name))
model.save_weights(os.path.join(fully_trained_weights_dir, 'weights_only_' + model_file_name))
models_file_name = [os.path.join(fully_trained_weights_dir,
                                 'weights_only_azure_chest_xray_14_weights_712split_epoch_054_val_loss_191.2588.hdf5')]

# NOTE: these only take effect if set before TensorFlow initializes; since
# load_model() above has already touched the GPU, they should really be set
# at the top of the script
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

resized_height = 224
resized_width = 224
num_channel = 3
num_classes = 14
batch_size = 100  # 512

def get_available_gpus():
    """Return the device names of the GPUs visible to TensorFlow."""
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

# report the number of available GPUs (the original ran this twice; once is
# enough, and the result is a list of device names, not a count)
available_gpus = get_available_gpus()
print("num of GPUs:", len(available_gpus))

pathologies_name_list = prj_consts.DISEASE_list
print(pathologies_name_list)  # bare expression in the original (notebook residue); print it in a script

# per-pathology AUC scores of the Stanford baseline, used for comparison below
stanford_result = [0.8094, 0.9248, 0.8638, 0.7345, 0.8676, 0.7802, 0.7680, 0.8887, 0.7901, 0.8878, 0.9371, 0.8047,
                   0.8062, 0.9164]

with open(label_path, 'rb') as f:
    labels = pickle.load(f)

with open(partition_path, 'rb') as f:
    partition = pickle.load(f)

class DataGenSequence(Sequence):
    def __init__(self, labels, image_file_index, current_state):
        self.batch_size = batch_size
        self.labels = labels
        self.img_file_index = image_file_index
        self.current_state = current_state
        self.len = len(self.img_file_index) // self.batch_size
        print("for DataGenSequence", current_state, "total rows are:", len(self.img_file_index), ", len is", self.len)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        # print("loading data segmentation", idx)
        # make sure every batch has the same amount of data
        current_batch = self.img_file_index[idx * self.batch_size: (idx + 1) * self.batch_size]
        X = np.empty((self.batch_size, resized_height, resized_width, num_channel))
        y = np.empty((self.batch_size, num_classes))

        for i, image_name in enumerate(current_batch):
            path = os.path.join(nih_chest_xray_data_dir, image_name)
            # load and resize the image
            img = cv2.resize(cv2.imread(path), (resized_height, resized_width)).astype(np.float16)
            X[i, :, :, :] = img
            y[i, :] = labels[image_name]  # note: the module-level labels dict, not self.labels

        # random flipping would only apply in training; unlike the training code,
        # this evaluation script leaves the images untouched in both branches
        if self.current_state == 'train':
            x_augmented = X
        else:
            x_augmented = X

        return x_augmented, y

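# Aside (illustrative sketch, not part of the original commit): the training
# code's 'train' branch would normally differ, typically via random
# augmentation. A minimal standalone sketch of a horizontal-flip step over an
# (N, H, W, C) batch:
def random_horizontal_flip(batch, prob=0.5):
    """Flip each image in the batch left-right with probability prob."""
    flipped = batch.copy()
    mask = np.random.rand(batch.shape[0]) < prob
    flipped[mask] = flipped[mask, :, ::-1, :]  # reverse the width axis
    return flipped
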
# load test labels; X_test is allocated but never filled or used below,
# since prediction streams images through DataGenSequence instead
X_test = np.empty((len(partition['test']), 224, 224, 3), dtype=np.float16)
y_test = np.empty((len(partition['test']) - len(partition['test']) % batch_size, 14), dtype=np.float16)

for i, npy in enumerate(partition['test']):
    if i < len(y_test):
        # truncated to a multiple of batch_size, matching the generator's length
        y_test[i, :] = labels[npy]

print("len of result is", len(y_test))
# pre-allocated for multi-model ensembling, though never filled below
y_pred_list = np.empty((len(models_file_name), len(partition['test']), 14), dtype=np.float16)

# individual models
for index, current_model_file in enumerate(models_file_name):
    print(current_model_file)
    # model = load_model(current_model_file)
    model = azure_chestxray_keras_utils.build_model(keras_contrib.applications.densenet.DenseNetImageNet121)
    model.load_weights(current_model_file)
    print('evaluation for model', current_model_file)
    # y_pred = model.predict(X_test)

    y_pred = model.predict_generator(generator=DataGenSequence(labels, partition['test'], current_state='test'),
                                     workers=32, verbose=1, max_queue_size=1)
    print("result shape", y_pred.shape)

    # add one fake row of ones to both the test and pred values to avoid:
    # ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
    y_test = np.insert(y_test, 0, np.ones((y_test.shape[1],)), 0)
    y_pred = np.insert(y_pred, 0, np.ones((y_pred.shape[1],)), 0)

    df = pd.DataFrame(columns=['Disease', 'Our AUC Score', 'Stanford AUC Score'])
    for d in range(14):
        df.loc[d] = [pathologies_name_list[d],
                     metrics.roc_auc_score(y_test[:, d], y_pred[:, d]),
                     stanford_result[d]]

    df['Delta'] = df['Stanford AUC Score'] - df['Our AUC Score']
    df.to_csv(current_model_file + ".csv", index=False)
    print(df)
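
# Aside (illustrative, not part of the original commit): why the fake row of
# ones matters -- roc_auc_score raises when y_true contains a single class, and
# prepending one positive with a perfect prediction keeps every column defined
# (at the cost of a tiny optimistic bias in each AUC):
demo_true = np.zeros(4)                  # a pathology absent from the test set
demo_pred = np.array([0.2, 0.1, 0.4, 0.3])
demo_true = np.insert(demo_true, 0, 1.0)  # the guard row
demo_pred = np.insert(demo_pred, 0, 1.0)
print(metrics.roc_auc_score(demo_true, demo_pred))  # 1.0 instead of a ValueError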
