From 686e172ba206dd7a95fa7a65095177ea6762a8d4 Mon Sep 17 00:00:00 2001 From: aibharata Date: Sat, 18 Jul 2020 13:38:30 +0530 Subject: [PATCH] Major Changes Added New Dataprocessors --- .vscode/settings.json | 1 - medicalai/__about__.py | 2 +- medicalai/chief/__init__.py | 4 +- medicalai/chief/callbacks/__init__.py | 17 ++ medicalai/chief/callbacks/custom_callbacks.py | 109 +++++++ medicalai/chief/core.py | 94 ++++-- medicalai/chief/dataloaders/__init__.py | 20 ++ medicalai/chief/dataloaders/data_utils.py | 30 ++ .../chief/dataloaders/dataset_processors.py | 31 ++ .../chief/dataloaders/dataset_visualize.py | 99 +++++++ .../chief/dataloaders/image_sequences.py | 215 ++++++++++++++ .../chief/dataloaders/tf_image_pipelines.py | 162 ++++++++++ medicalai/chief/dataset_prepare.py | 181 ++++++++++- medicalai/chief/model_metrics/modelstats.py | 280 +++++++++++++----- medicalai/chief/networks.py | 25 +- medicalai/chief/nnets/covid_net.py | 5 +- medicalai/chief/nnets/densenet.py | 8 +- medicalai/chief/nnets/inceptionResnet.py | 8 +- medicalai/chief/nnets/inceptionv3.py | 8 +- medicalai/chief/nnets/mobilenet.py | 8 +- medicalai/chief/nnets/mobilenetv2.py | 8 +- medicalai/chief/nnets/resnet.py | 5 +- medicalai/chief/nnets/vgg16.py | 8 +- medicalai/chief/nnets/xception.py | 8 +- medicalai/chief/uFuncs.py | 26 +- setup.py | 4 +- 26 files changed, 1217 insertions(+), 149 deletions(-) create mode 100644 medicalai/chief/callbacks/__init__.py create mode 100644 medicalai/chief/callbacks/custom_callbacks.py create mode 100644 medicalai/chief/dataloaders/__init__.py create mode 100644 medicalai/chief/dataloaders/data_utils.py create mode 100644 medicalai/chief/dataloaders/dataset_processors.py create mode 100644 medicalai/chief/dataloaders/dataset_visualize.py create mode 100644 medicalai/chief/dataloaders/image_sequences.py create mode 100644 medicalai/chief/dataloaders/tf_image_pipelines.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 9c1d72b..7a73a41 100644 --- 
a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,2 @@ { - "python.pythonPath": "C:\\Python35\\python3.exe" } \ No newline at end of file diff --git a/medicalai/__about__.py b/medicalai/__about__.py index d1704e9..3ce4537 100644 --- a/medicalai/__about__.py +++ b/medicalai/__about__.py @@ -14,5 +14,5 @@ __project__ = "medicalai" __author__ = "Vinayaka Jyothi - For AiBharata" -__version__ = "1.1.59" +__version__ = "1.2.2-rc" __license__ = "Apache" \ No newline at end of file diff --git a/medicalai/chief/__init__.py b/medicalai/chief/__init__.py index 03418cc..286cd4b 100644 --- a/medicalai/chief/__init__.py +++ b/medicalai/chief/__init__.py @@ -22,4 +22,6 @@ from . import nnets from . import model_metrics from . import xai -from . import dataset_analysis as dataAnalyzer \ No newline at end of file +from . import dataset_analysis as dataAnalyzer +from . import callbacks as callbacks +from . import dataloaders as dataloader \ No newline at end of file diff --git a/medicalai/chief/callbacks/__init__.py b/medicalai/chief/callbacks/__init__.py new file mode 100644 index 0000000..d86af10 --- /dev/null +++ b/medicalai/chief/callbacks/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import + +from .custom_callbacks import * \ No newline at end of file diff --git a/medicalai/chief/callbacks/custom_callbacks.py b/medicalai/chief/callbacks/custom_callbacks.py new file mode 100644 index 0000000..01faaae --- /dev/null +++ b/medicalai/chief/callbacks/custom_callbacks.py @@ -0,0 +1,109 @@ +from __future__ import absolute_import +from tensorflow.keras.callbacks import Callback +from sklearn.metrics import roc_auc_score + +class AUROC_Callback(Callback): + def __init__(self, generator, workers=1): + super().__init__() + self.generator = generator + self.workers = workers + + def on_epoch_end(self, epoch, logs=None): + y_pred = self.model.predict(self.generator, workers=self.workers) + y_true= self.generator.labels + meanAUROC = roc_auc_score(y_true,y_pred) + print(' - mAUROC:', meanAUROC) + +class MultipleClassAUROC(Callback): + ''' + Sample Usage: + auroc = MultipleClassAUROC( + sequence=validation_sequence, + class_names=class_names, + weights_path=output_weights_path, + stats=training_stats, + workers=generator_workers, + ) + ''' + def __init__(self, sequence, class_names, weights_path, stats=None, workers=1): + super(Callback, self).__init__() + self.sequence = sequence + self.workers = workers + self.class_names = class_names + self.weights_path = weights_path + self.best_weights_path = os.path.join( + os.path.split(weights_path)[0], + "best_{}".format(os.path.split(weights_path)[1]), + ) + self.best_auroc_log_path = os.path.join( + os.path.split(weights_path)[0], + "best_auroc.log", + ) + self.stats_output_path = os.path.join( + os.path.split(weights_path)[0], + ".training_stats.json" + ) + + # for resuming previous training + if stats: + self.stats = stats + else: + self.stats = {"best_mean_auroc": 0} + + # aurocs log + self.aurocs = {} + for c in self.class_names: + self.aurocs[c] = [] + + def on_epoch_end(self, epoch, logs={}): + """ + Calculate the average AUROC and save the best model weights according + to 
this metric. + + """ + print("\n*********************************") + self.stats["lr"] = float(kb.eval(self.model.optimizer.lr)) + print("current learning rate: {}".format(self.stats['lr'])) + + """ + y_hat shape: (#samples, len(class_names)) + y: [(#samples, 1), (#samples, 1) ... (#samples, 1)] + """ + y_hat = self.model.predict(self.sequence, workers=self.workers) + y = self.sequence.get_y_true() + + print("*** epoch#{} dev auroc ***".format(epoch + 1)) + current_auroc = [] + for i in range(len(self.class_names)): + try: + score = roc_auc_score(y[:, i], y_hat[:, i]) + except ValueError: + score = 0 + self.aurocs[self.class_names[i]].append(score) + current_auroc.append(score) + print("{}. {}: {}".foramt(i+1,self.class_names[i],score)) + print("*********************************") + + # customize your multiple class metrics here + mean_auroc = np.mean(current_auroc) + print("mean auroc: {}".format(mean_auroc)) + if mean_auroc > self.stats["best_mean_auroc"]: + print("update best auroc from {} to {}".format(self.stats['best_mean_auroc'],mean_auroc)) + + # 1. copy best model + shutil.copy(self.weights_path, self.best_weights_path) + + # 2. update log file + print("update log file: {}".format(self.best_auroc_log_path)) + with open(self.best_auroc_log_path, "a") as f: + f.write("(epoch#{}) auroc: {}, lr: {}\n".format(epoch + 1,mean_auroc,self.stats['lr'])) + + # 3. 
write stats output, this is used for resuming the training + with open(self.stats_output_path, 'w') as f: + json.dump(self.stats, f) + + print("update model file: {} -> {}".format(self.weights_path, self.best_weights_path)) + self.stats["best_mean_auroc"] = mean_auroc + print("*********************************") + return + diff --git a/medicalai/chief/core.py b/medicalai/chief/core.py index ee88ba1..1385c75 100644 --- a/medicalai/chief/core.py +++ b/medicalai/chief/core.py @@ -35,10 +35,13 @@ from .model_metrics import * from .xai import * from .uFuncs import * +from albumentations import Compose +import albumentations.augmentations.transforms as augmentations physical_devices = tf.config.list_physical_devices('GPU') if len(physical_devices)>1: MULTI_GPU_MODE= True + print('[INFO]: Medicalai activated with MultiGPU Mode') else: MULTI_GPU_MODE= False GPU_to_Use = 'all' @@ -196,7 +199,7 @@ def train( model, x_train, class_weights = None, saveBestModel = False, bestModelCond = None, validation_data = None, TRAIN_STEPS = None, TEST_STEPS = None, - verbose=None, y_train=None, + verbose=None, y_train=None, workers = 1 ): if callbacks is not None: if ('tensorboard'in callbacks): @@ -224,7 +227,8 @@ def train( model, x_train, epochs=epochs, validation_data=validation_data, callbacks=callbacks, - class_weight = class_weights + class_weight = class_weights, + workers =workers ) else: result = model.fit(x_train, @@ -233,7 +237,8 @@ def train( model, x_train, validation_data=validation_data, callbacks=callbacks, verbose = verbose, - class_weight = class_weights + class_weight = class_weights, + workers = workers ) return result.history @@ -492,7 +497,7 @@ def preprocessor_from_meta(self, metaFile=None): self.labelNames = self.preProcessor.labels #@timeit - def predict(self, input, verbose=0): + def predict(self, input, verbose=1, safe=False , workers= 1): """ Peform prediction on Input. Input can be Numpy Array or Image or Data Generator (in case of Test/Validation). 
@@ -519,24 +524,34 @@ def predict(self, input, verbose=0): # Returns Numpy.Array: of Predictions. Shape of Output [Number of Inputs, Number of Output Classes in Model] """ - if hasattr(input, 'generator') and hasattr(input, 'STEP_SIZE'): - return self.model.predict(input.generator, verbose=1) - elif hasattr(input, 'image_data_generator'): - return self.model.predict(input, verbose=1) - elif hasattr(input, 'data') and not isinstance(input,np.ndarray): - return self.model.predict(input.data, verbose=verbose) + if hasattr(self, 'workers'): + workers = self.workers else: - if self.preProcessor is not None: - input = self.preProcessor.processImage(input) - return self.model.predict(input, verbose=verbose) + workers = workers + if safe: + if hasattr(input, 'generator') and hasattr(input, 'STEP_SIZE'): + return self.model.predict(input.generator, steps=input.STEP_SIZE, verbose=1, workers=workers) + elif hasattr(input, 'image_data_generator'): + return self.model.predict(input, steps =(input.n/input.batch_size), verbose=1, workers=workers) + else: + if hasattr(input, 'generator') and hasattr(input, 'STEP_SIZE'): + return self.model.predict(input.generator, verbose=1, workers=workers) + elif hasattr(input, 'image_data_generator'): + return self.model.predict(input, verbose=1, workers=workers) + elif hasattr(input, 'data') and not isinstance(input,np.ndarray): + return self.model.predict(input.data, verbose=verbose, workers=workers) else: - if self.labelNames is None: - if hasattr(input, 'labelNames'): - self.labelNames = input.labelNames if self.labelNames is None else self.labelNames - if isinstance(input,np.ndarray): - return self.model.predict(input, verbose=verbose) + if self.preProcessor is not None: + input = self.preProcessor.processImage(input) + return self.model.predict(input, verbose=verbose, workers=workers) else: - return self.model.predict(input, verbose=verbose) + if self.labelNames is None: + if hasattr(input, 'labelNames'): + self.labelNames = 
input.labelNames if self.labelNames is None else self.labelNames + if isinstance(input,np.ndarray): + return self.model.predict(input, verbose=verbose, workers=workers) + else: + return self.model.predict(input, verbose=verbose, workers=workers) #@timeit def predict_pipeline(self, input): @@ -670,7 +685,7 @@ def summary(self): """ return self.model.summary() - def generate_evaluation_report(self, testSet = None, predictions = None, printStat = False,returnPlot = False, showPlot= False, pdfName =None, **kwargs): + def generate_evaluation_report(self, testSet = None, predictions = None, printStat = True,returnPlot = False, showPlot= False, pdfName =None, **kwargs): """ Generate a comprehensive PDF report with model sensitivity, specificity, accuracy, confidence intervals, ROC Curve Plot, Precision Recall Curve Plot, and Confusion Matrix Plot for each class. @@ -800,12 +815,12 @@ class TRAIN_ENGINE(INFERENCE_ENGINE): """ def __init__(self, modelName=None): super().__init__(modelName) - + def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPUT_CLASSES, RETRAIN_MODEL, EPOCHS, BATCH_SIZE=32, LEARNING_RATE=0.0001, convLayers=None,SAVE_BEST_MODEL=False, BEST_MODEL_COND=None, callbacks=None, loss = 'sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.0001), - metrics = ['accuracy'], showModel = False, - CLASS_WEIGHTS=None, **kwargs): + metrics = ['accuracy'], showModel = False, workers = 1, + CLASS_WEIGHTS=None, **kwargs,): """" Main function that trains and saves a model. This automatically builds new model for given networks/AI or reload existing AI model. This function can be used to retrain existing models or create new models. @@ -880,9 +895,11 @@ def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPU None: On successful completion saves the trained model. 
""" + self.workers = workers self.testSet = testSet self.modelName = MODEL_SAVE_NAME self.test_predictions = None + global MULTI_GPU_MODE, GPU_to_Use if hasattr(trainSet, 'data'): self.labelNames = trainSet.labelNames if MULTI_GPU_MODE and GPU_to_Use.lower()=='all': @@ -890,7 +907,7 @@ def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPU with mirrored_strategy.scope(): self.model = modelManager(AI_NAME= AI_NAME, convLayers= convLayers, modelName = MODEL_SAVE_NAME, x_train = trainSet.data, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL) self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics) - BATCH_SIZE *= mirrored_strategy.num_replicas_in_sync + #BATCH_SIZE *= mirrored_strategy.num_replicas_in_sync else: self.model = modelManager(AI_NAME= AI_NAME, convLayers= convLayers, modelName = MODEL_SAVE_NAME, x_train = trainSet.data, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL) self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics) @@ -898,7 +915,7 @@ def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPU print('[INFO]: BATCH_SIZE -',BATCH_SIZE) self.result = train(self.model, trainSet.data, y_train= trainSet.labels, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(testSet.data, testSet.labels), callbacks=callbacks, saveBestModel= SAVE_BEST_MODEL, - bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = None, TEST_STEPS = None, + bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = None, TEST_STEPS = None, workers = self.workers, class_weights=CLASS_WEIGHTS)#['tensorboard']) #self.model.evaluate(testSet.data, testSet.labels) @@ -906,25 +923,40 @@ def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPU rescale =None, network_input_dim =trainSet.network_input_dim, samplingMethodName=trainSet.samplingMethodName, outputName= MODEL_SAVE_NAME) else: - networkDim = np.zeros((1,)+trainSet.generator.image_shape) + from tensorflow.python.data.ops.dataset_ops 
import PrefetchDataset + if isinstance(trainSet.generator, PrefetchDataset): + for f,l in trainSet.generator.take(1): + inpSize = f.numpy().shape + networkDim = np.zeros((1,)+inpSize[1:]) + networkInputSize = inpSize[1:] + rescaleValue = 1./255 + else: + networkDim = np.zeros((1,)+trainSet.generator.image_shape) + networkInputSize = trainSet.generator.image_shape + try: + rescaleValue = trainSet.generator.image_data_generator.rescale + except: + rescaleValue = 1./255 + self.labelNames = dataprc.safe_labelmap_converter(trainSet.labelMap) if MULTI_GPU_MODE and GPU_to_Use.lower()=='all': mirrored_strategy = tf.distribute.MirroredStrategy() with mirrored_strategy.scope(): - mirrored_strategy = tf.distribute.MirroredStrategy() self.model = modelManager(AI_NAME= AI_NAME, modelName = MODEL_SAVE_NAME, x_train = networkDim, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL, **kwargs) self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics) - BATCH_SIZE *= mirrored_strategy.num_replicas_in_sync else: self.model = modelManager(AI_NAME= AI_NAME, modelName = MODEL_SAVE_NAME, x_train = networkDim, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL, **kwargs) self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics) print(self.model.summary()) if showModel else None print('[INFO]: BATCH_SIZE -',BATCH_SIZE) - self.result = train(self.model, trainSet.generator, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=testSet.generator, callbacks=callbacks, saveBestModel= SAVE_BEST_MODEL, bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = trainSet.STEP_SIZE, TEST_STEPS = testSet.STEP_SIZE, verbose=1,class_weights=CLASS_WEIGHTS)#['tensorboard']) + self.result = train(self.model, trainSet.generator, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=testSet.generator, + callbacks=callbacks, saveBestModel= SAVE_BEST_MODEL, bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = trainSet.STEP_SIZE, + TEST_STEPS = testSet.STEP_SIZE, 
verbose=1,class_weights=CLASS_WEIGHTS, workers = self.workers + ) #self.model.evaluate(testSet.generator,steps = testSet.STEP_SIZE) dataprc.metaSaver(trainSet.labelMap, self.labelNames, normalize= None, - rescale = trainSet.generator.image_data_generator.rescale, - network_input_dim =trainSet.generator.image_shape, samplingMethodName=None, outputName= MODEL_SAVE_NAME) + rescale = rescaleValue, + network_input_dim =networkInputSize, samplingMethodName=None, outputName= MODEL_SAVE_NAME) save_model_and_weights(self.model, outputName= MODEL_SAVE_NAME) diff --git a/medicalai/chief/dataloaders/__init__.py b/medicalai/chief/dataloaders/__init__.py new file mode 100644 index 0000000..df0c3f9 --- /dev/null +++ b/medicalai/chief/dataloaders/__init__.py @@ -0,0 +1,20 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +from .data_utils import * +from .dataset_processors import * +from .image_sequences import * +from .tf_image_pipelines import * \ No newline at end of file diff --git a/medicalai/chief/dataloaders/data_utils.py b/medicalai/chief/dataloaders/data_utils.py new file mode 100644 index 0000000..642e1c6 --- /dev/null +++ b/medicalai/chief/dataloaders/data_utils.py @@ -0,0 +1,30 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. 
+ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import + +def safe_labelmap_converter(labelMap): + labs = [0 for x in list(labelMap.keys())] + for k,v in labelMap.items(): + labs[v]=k + return labs + +def safe_label_to_labelmap_converter(labels): + labelMap = {} + for x in range(0,len(labels)): + labelMap[labels[x]]=x + return labelMap + +class myDict(dict): + pass \ No newline at end of file diff --git a/medicalai/chief/dataloaders/dataset_processors.py b/medicalai/chief/dataloaders/dataset_processors.py new file mode 100644 index 0000000..1aa455e --- /dev/null +++ b/medicalai/chief/dataloaders/dataset_processors.py @@ -0,0 +1,31 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +import pandas as pd +import numpy as np +from PIL import Image +import os +from albumentations import Compose + +class aiDataLoader(object): + """Base class for aiDataLoader. + """ + def __call__(self, targetDim, color_mode,class_mode, batch_size,shuffle,seed,augmentations, **kwargs): + return self.call(targetDim, color_mode,class_mode, batch_size,shuffle,seed,augmentations, **kwargs) + + def call(self, targetDim, color_mode,class_mode, batch_size,shuffle,seed,augmentations, **kwargs): + raise NotImplementedError() + + def __str__(self): + return self.__class__.__name__ \ No newline at end of file diff --git a/medicalai/chief/dataloaders/dataset_visualize.py b/medicalai/chief/dataloaders/dataset_visualize.py new file mode 100644 index 0000000..bff60f6 --- /dev/null +++ b/medicalai/chief/dataloaders/dataset_visualize.py @@ -0,0 +1,99 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import absolute_import +import matplotlib.pyplot as plt +import seaborn as sns +import matplotlib.cm as cm + +class dataSetStats(object): + def __init__(self, dataFrame): + super().__init__() + self.dataFrame = dataFrame + self.positve_class_count = dataFrame.loc[1,:] + self.negative_class_count = dataFrame.loc[0,:] + + @staticmethod + def _plot_hbar_df(pos, ax, figsize=(8,8),color='slateblue', + title = "Title", xaxis_title = "X_AXIS", label_y_offset= 0.45, + label_fontsize = 15): + offsetFactor=len(pos.shape) + maxVal = pos.max() if len(pos.shape)==1 else pos.max().max() + pos.plot(kind='barh', figsize=figsize, color=color, alpha=0.5, ax = ax) + ax.set_title(title, fontsize=18) + ax.set_xlabel(xaxis_title, fontsize=18) + for i in ax.patches: + if maxVal - i.get_width()> maxVal*(1/10): + ax.text(i.get_width()*(1+.03), i.get_y()+label_y_offset/offsetFactor, + str(i.get_width()), fontsize=label_fontsize-offsetFactor, + color='blue') + else: + ax.text(i.get_width()-maxVal/10, i.get_y()+label_y_offset/offsetFactor, + str(i.get_width()), fontsize=label_fontsize-offsetFactor, + color='black') + + ax.invert_yaxis() + return ax + + def plot_samples_count(self, figsize=(8,8),color='red', + title = "Class Distribution of Dataset", + xaxis_title = "No. of Samples per Class", + label_y_offset= 0.45, label_fontsize = 15): + fig, ax = plt.subplots() + ax = self._plot_hbar_df(self.positve_class_count, ax,figsize=figsize,color=color, + title = title, xaxis_title = xaxis_title, label_y_offset=label_y_offset, + label_fontsize=label_fontsize) + plt.tight_layout(pad=2) + return fig + + def plot_samples_count(self, figsize=(8,8),color='red', showPlot= True, + title = "Class Count of Dataset", + xaxis_title = "No. 
of Samples per Class", + label_y_offset= 0.45, label_fontsize = 15): + fig, ax = plt.subplots() + ax = self._plot_hbar_df(self.positve_class_count, ax,figsize=figsize,color=color, + title = title, xaxis_title = xaxis_title, label_y_offset=label_y_offset, + label_fontsize=label_fontsize) + plt.tight_layout(pad=2) + if showPlot: + plt.show() + return fig + + def plot_dataset_distribution(self,figsize=(8,10),color=['red', 'dodgerblue'], + title = "Class Frequencies of Dataset", showPlot= True, + xaxis_title = "Negative vs Positive Sample Distribution", + label_y_offset= 0.45, label_fontsize = 15): + meanDF = self.dataFrame.copy().T + + meanDF['total']=meanDF.sum(axis=1) + for x in self.dataFrame.index.values.tolist(): + meanDF[x] = meanDF[x]/meanDF['total'] + meanDF = meanDF.drop(['total'], axis = 1) + meanDF = meanDF.apply(lambda x:round(x,3)) + meanDF.rename(columns ={0:'Negative',1:'Positive'}, inplace=True) + self.sample_freq_df = meanDF + fig, ax = plt.subplots() + ax = self._plot_hbar_df(meanDF, ax,figsize=figsize,color=color, + title = title, xaxis_title = xaxis_title, label_y_offset=label_y_offset, + label_fontsize=label_fontsize) + plt.tight_layout(pad=2) + if showPlot: + plt.show() + return fig + + def plot_all_dataset_analysis(self, returnPlot = False, showPlot= False): + fig1 = self.plot_samples_count(showPlot=showPlot) + fig2 = self.plot_dataset_distribution(showPlot=showPlot) + if returnPlot: + return [fig1, fig2] \ No newline at end of file diff --git a/medicalai/chief/dataloaders/image_sequences.py b/medicalai/chief/dataloaders/image_sequences.py new file mode 100644 index 0000000..d6cbcd2 --- /dev/null +++ b/medicalai/chief/dataloaders/image_sequences.py @@ -0,0 +1,215 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +import pandas as pd +import numpy as np +from PIL import Image +from tensorflow.keras.utils import Sequence +import os +import albumentations.augmentations.transforms as AUG +from albumentations import Compose +from .data_utils import * +from ..dataset_analysis import compute_class_freqs +from .dataset_visualize import * + +class ImageDatasetSeqFromDF(object): + def __init__(self, trainDF=None, testDF=None, valDF=None, dataFolder='', + inputCol="files", labelCols=['labels'], batch_size=16, + targetDim=(96,96),color_mode="rgb",shuffle=True, seed=21, + class_mode="raw", train_augmentations=Compose([AUG.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]), + test_val_augmentations=Compose([AUG.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]), + class_weights=None, output_range=None, + ): + self.trainDF, self.testDF, self.valDF = trainDF, testDF, valDF + self.dataFolder = dataFolder + self.inputCol, self.labelCols = inputCol, labelCols + self.batch_size = batch_size + self.targetDim, self.color_mode = targetDim, color_mode + self.shuffle, self.seed = shuffle, seed + self.class_mode = class_mode + self.train_augment = train_augmentations + self.test_val_augment = test_val_augmentations + self.trainGen = myDict() + self.testGen = myDict() + self.valGen = myDict() + self.class_weights = myDict() + self.output_range = output_range + + self.labelMap = safe_label_to_labelmap_converter(self.labelCols) + if isinstance(self.trainDF, pd.DataFrame) or isinstance(self.trainDF, str): + 
self.trainGen.generator = ImageSequenceFromDF(dataFrame=self.trainDF, dataFolder=dataFolder, name ='train', + inputCol=inputCol, labelCols=labelCols, batch_size=batch_size, targetDim=targetDim, seed=seed, + color_mode=color_mode,shuffle=shuffle, class_mode=class_mode, augmentations=train_augmentations, + output_range=output_range + ) + self.trainGen = self._update_params(self.trainGen) + + if isinstance(self.testDF, pd.DataFrame) or isinstance(self.testDF, str): + self.testGen.generator = ImageSequenceFromDF(dataFrame=self.testDF, dataFolder=dataFolder, name ='test', + inputCol=inputCol, labelCols=labelCols, batch_size=batch_size, targetDim=targetDim, seed=seed, + color_mode=color_mode,shuffle=False, class_mode=class_mode, augmentations=test_val_augmentations, + output_range=output_range + ) + self.testGen = self._update_params(self.testGen) + if isinstance(self.valDF, pd.DataFrame) or isinstance(self.valDF, str): + self.valGen.generator = ImageSequenceFromDF(dataFrame=self.valDF, dataFolder=dataFolder, name ='val', + inputCol=inputCol, labelCols=labelCols, batch_size=batch_size, targetDim=targetDim, seed=seed, + color_mode=color_mode,shuffle=False, class_mode=class_mode, augmentations=test_val_augmentations, + output_range=output_range + ) + self.valGen = self._update_params(self.valGen) + + def _update_params(self, thisGen): + thisGen.STEP_SIZE = thisGen.generator.STEP_SIZE + if self.class_mode in ['binary', 'sparse', 'categorical']: + thisGen.labelNames = safe_labelmap_converter(thisGen.labelMap) + elif self.class_mode in ['raw', 'multi_output']: + thisGen.labelNames = self.labelCols + thisGen.labelMap = self.labelMap + return thisGen + + def load_generator(self): + return self.trainGen, self.testGen, self.valGen + +class ImageSequenceFromDF(Sequence): + def __init__(self, dataFrame, dataFolder='', name ='train', + inputCol="files", labelCols=['labels'], batch_size=16, + targetDim=(96,96),color_mode="rgb",shuffle=True, seed=21, output_range=None, + 
class_mode="raw", augmentations=Compose([AUG.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))]) + ): + self.dataFolder,self.name = dataFolder, name + self.inputCol, self.labelCols = inputCol, labelCols + self.batch_size = batch_size + self.targetDim, self.color_mode = targetDim, color_mode + self.shuffle, self.seed = shuffle, seed + self.class_mode = class_mode + self.augment = augmentations + self.output_range = output_range + + self.convertCSVFile2DF(dataFrame) + self._validateDF(self.dataFrame, self.name) + self._dfConvertFilePath() + self.N = self.dataFrame.shape[0] + self.n = 0 + self.classes = labelCols + self.num_classes = len(list(labelCols)) + self.STEP_SIZE = self.__len__() + self.shuffleDataFrame = self.dataFrame + self._add_info() + self._calculate_class_weights() + self.on_epoch_end() + + def img_processor(self, image): + img = Image.open(image) + if self.color_mode.upper() == 'RGB' and img.mode != 'RGB': + img = img.convert('RGB') + elif self.color_mode.upper() == 'RGBA'and img.mode != 'RGBA': + img = img.convert('RGBA') + elif self.color_mode.upper() == 'GRAYSCALE' and img.mode != 'L': + img = img.convert('L') + img = img.resize((self.targetDim[0:2]),0) + img = np.array(img, 'uint8') + #img = np.array(img*(1./255)).astype('float32') + return img + + def load_generator(self): + return self.generator + + def _calculate_class_weights(self): + pos_freq,neg_freq = compute_class_freqs(self.labels) + wgtList=[] + for x,y in zip(pos_freq,neg_freq): + labelWgtDict = {0:x, 1:y} + wgtList.append(labelWgtDict) + self.class_weights = wgtList + + def get_class_weights(self): + return self.class_weights + + def __len__(self): + return int(np.ceil(self.dataFrame.shape[0] / float(self.batch_size))) + + def __getitem__(self, idx): + batchDF = self.shuffleDataFrame[idx * self.batch_size:(idx + 1) * self.batch_size] + batch_x = batchDF[self.inputCol].to_numpy() + batch_y = batchDF[self.labelCols].to_numpy() + if self.augment != None: + if 
self.output_range == None or isinstance(self.output_range, str): + return np.stack( + [self.augment(image=self.img_processor(x))["image"] for x in batch_x], + axis=0), np.array(batch_y) + else: + return np.stack( + [np.clip(self.augment(image=self.img_processor(x))["image"],self.output_range[0],self.output_range[1]) for x in batch_x], + axis=0), np.array(batch_y) + else: + return np.stack( + [self.img_processor(x) for x in batch_x], + axis=0), np.array(batch_y) + + def __next__(self): + batch_x, batch_y = self.__getitem__(self.n) + self.n += 1 + if self.n >= self.__len__(): + self.on_epoch_end() + self.n = 0 + return batch_x, batch_y + + def on_epoch_end(self): + if self.shuffle == True: + self.shuffleDataFrame = self.dataFrame.sample(n=self.N, random_state=self.seed).reset_index(drop=True) + self.seed+=1 + + def _add_info(self): + if self.color_mode.upper() == 'RGB': + self.image_shape= self.targetDim+ (3,) + elif self.color_mode.upper() == 'RGBA': + self.image_shape= self.targetDim+ (4,) + elif self.color_mode.upper() == 'GRAYSCALE': + self.image_shape= self.targetDim+ (2,) + self.labels = self.dataFrame[self.labelCols].values + + def check_imbalanced_dataset(self): + dfC = self.dataFrame[self.labelCols].apply(pd.value_counts) + self.dataSetStats = dataSetStats(dfC) + return self.dataSetStats + + def convertCSVFile2DF(self,dataFrame): + if isinstance(dataFrame, pd.DataFrame): + self.dataFrame= dataFrame + else: + print('[INFO]: Reading CSV Files into DataFrame ', end='') + self.dataFrame = pd.read_csv(dataFrame) + print(' - Done!') + + def _validateDF(self,df, name): + inPresent = True if self.inputCol in df.columns else False + labelPresent = True if set(self.labelCols).issubset(df.columns) else False + if inPresent and labelPresent: + print('[INFO]: Dataframe {} Validation.. Success!'.format(name)) + else: + print('[ERROR]: Dataframe {} Validation.. 
Failure!'.format(name)) + print('[---->]: Label Validation- {} : Input Validation - {}'.format( + 'PASS' if labelPresent else 'FAIL','PASS' if inPresent else 'FAIL',)) + + def _get_sample_full_path(self, fileName): + return os.path.join(self.dataFolder, fileName) + + def _createFullInputPath(self, df): + df[self.inputCol] = df[self.inputCol].map(lambda row : self._get_sample_full_path(row)) + return df + + def _dfConvertFilePath(self): + self.dataFrame= self._createFullInputPath(self.dataFrame) + diff --git a/medicalai/chief/dataloaders/tf_image_pipelines.py b/medicalai/chief/dataloaders/tf_image_pipelines.py new file mode 100644 index 0000000..85c02e1 --- /dev/null +++ b/medicalai/chief/dataloaders/tf_image_pipelines.py @@ -0,0 +1,162 @@ +# Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class ImagePipelineFromDF(object):
    """Build a ``tf.data`` image pipeline from a DataFrame or a CSV file.

    Every row provides an image file name (`inputCol`) and its label columns
    (`labelCols`). File names are joined onto `dataFolder`; images are read,
    decoded and resized to `targetDim`, then cached, shuffled, repeated,
    batched, augmented and prefetched as configured.

    # Arguments
        dataFrame: pandas DataFrame, or a CSV path/buffer for `pd.read_csv`.
        dataFolder: Folder joined in front of every `inputCol` entry.
        inputCol: Name of the column holding image file names.
        labelCols: Names of the label columns.
        batch_size: Number of samples per batch.
        targetDim: Target size; its first two entries are passed to
            `tf.image.resize`.
        augmentations: Callable used as `augmentations(image=x)["image"]` on
            each image of a batch. `None` disables augmentation.
        color_mode, class_mode, seed: Stored on the instance only; the
            pipeline below does not read them.
        shuffle: Shuffle samples using `shuffle_buffer_size`.
        normalize: Convert decoded images to `tf.float32`.
        shuffle_buffer_size: Buffer size handed to `Dataset.shuffle`.
        cache: `True` caches in memory, a string caches to that file name,
            a falsy value disables caching.
        prefetch: Prefetch batches with `AUTOTUNE`.
        name: Dataset role. Only `'train'` (case-insensitive) enables
            repeating and augmentation.
    """

    # NOTE: the list default of `labelCols` is kept for signature
    # compatibility; it is only read, never mutated, inside this class.
    def __init__(self, dataFrame, dataFolder='',
                 inputCol="files", labelCols=['labels'], batch_size=16,
                 targetDim=(96,96), augmentations=None, color_mode="rgb",
                 class_mode="raw", shuffle=True, seed=21, normalize=True,
                 shuffle_buffer_size=1000, cache=False, prefetch=True,
                 name='train'
                 ):
        self.convertCSVFile2DF(dataFrame)
        self.dataFolder = dataFolder
        self.inputCol, self.labelCols = inputCol, labelCols
        self.BATCH_SIZE = batch_size
        self.targetDim = targetDim
        self.augmentations = augmentations
        self.color_mode = color_mode
        self.class_mode = class_mode
        self.shuffle_buffer_size, self.shuffle = shuffle_buffer_size, shuffle
        self.normalize = normalize
        self.seed = seed
        self.prefetch, self.cache = prefetch, cache

        self._validateDF(self.dataFrame, name)
        self._dfConvertFilePath()
        self.N = self.dataFrame.shape[0]
        self.n = 0
        self.classes = labelCols
        self.num_classes = len(list(labelCols))
        self.STEP_SIZE = len(self)
        self.list_ds = tf.data.Dataset.from_tensor_slices(
            (self.dataFrame[self.inputCol].values,
             self.dataFrame[self.labelCols].values))
        self.process_dataset()

        # `name.lower` without the call is a bound method and never equals
        # 'train'; the method has to be invoked for the comparison to work.
        is_train = name.lower() == 'train'
        self.generator = self.prepare_for_training(
            self.labeled_ds, repeat=is_train, augment=is_train)

    def tfDataset(self):
        """Return the prepared `tf.data.Dataset`."""
        return self.generator

    @tf.function
    def decode_img(self, img):
        """Decode raw image bytes and resize them to `targetDim[0:2]`.

        The image is always decoded with three channels. With
        `self.normalize` set it is first converted to `tf.float32`.
        """
        # NOTE(review): decode_jpeg and channels=3 are hard-coded, so
        # `color_mode` has no effect here - confirm this is intended.
        img = tf.image.decode_jpeg(img, channels=3)
        if self.normalize:
            img = tf.image.convert_image_dtype(img, tf.float32)
        return tf.image.resize(img, list(self.targetDim[0:2]))

    def process_path(self, file_path, labels):
        """Map a `(file_path, labels)` pair to `(decoded image, labels)`."""
        img = tf.io.read_file(file_path)
        img = self.decode_img(img)
        return img, labels

    def process_dataset(self):
        """Create `self.labeled_ds` by decoding every entry of `self.list_ds`."""
        self.labeled_ds = self.list_ds.map(self.process_path, num_parallel_calls=AUTOTUNE)

    def prepare_for_training(self, ds, repeat=True, augment=True):
        """Apply caching, shuffling, repeating, batching, augmentation and prefetching.

        - For a small dataset, `cache=True` loads it once and keeps it in memory.
        - Passing a file name as `cache` stores the preprocessing work on
          disk for datasets that do not fit in memory.

        # Arguments
            ds: Dataset of `(image, label)` pairs.
            repeat: Repeat the dataset indefinitely.
            augment: Run `self.augmentations` on every batch. Ignored when
                no augmentations were configured.

        # Returns
            The prepared `tf.data.Dataset`.
        """
        if self.cache:
            if isinstance(self.cache, str):
                ds = ds.cache(self.cache)
            else:
                ds = ds.cache()

        if self.shuffle:
            ds = ds.shuffle(buffer_size=self.shuffle_buffer_size)

        if repeat:
            ds = ds.repeat()

        ds = ds.batch(self.BATCH_SIZE)
        # Without configured augmentations there is nothing to call, so the
        # augmentation stage is skipped instead of failing inside the map.
        if augment and self.augmentations is not None:
            ds = ds.map(self.augment, num_parallel_calls=AUTOTUNE)

        if self.prefetch:
            ds = ds.prefetch(buffer_size=AUTOTUNE)

        return ds

    def augment(self, image, label):
        """Augment one batch through `custom_augment` and clip it to [0, 1]."""
        image = tf.numpy_function(func=self.custom_augment, inp=[image], Tout=tf.float32)
        image = tf.clip_by_value(image, 0, 1)
        return image, label

    def custom_augment(self, image):
        """Run `self.augmentations` on every image of a NumPy batch.

        The batch is scaled by 255 and cast to `uint8` before augmentation.
        The result is always returned as `float32`, which is the dtype
        `augment` declares to `tf.numpy_function`; `uint8` results are
        scaled back by 255 first.
        """
        # NOTE(review): the *255 scaling presumes inputs in [0, 1], i.e.
        # normalize=True - confirm behaviour for normalize=False.
        batch = np.uint8(image * 255)
        augmented = np.stack(
            [self.augmentations(image=sample)["image"] for sample in batch], axis=0)
        if augmented.dtype == np.uint8:
            return augmented.astype(np.float32) / 255.0
        return augmented.astype(np.float32)

    def load_generator(self):
        """Return the prepared `tf.data.Dataset` (same object as `tfDataset`)."""
        return self.generator

    def __len__(self):
        """Number of batches needed to cover the DataFrame once."""
        return int(np.ceil(self.dataFrame.shape[0] / float(self.BATCH_SIZE)))

    def convertCSVFile2DF(self, dataFrame):
        """Store `dataFrame` as `self.dataFrame`, reading it with `pd.read_csv` if needed."""
        if isinstance(dataFrame, pd.DataFrame):
            self.dataFrame = dataFrame
        else:
            print('[INFO]: Reading CSV Files into DataFrame ', end='')
            self.dataFrame = pd.read_csv(dataFrame)
            print(' - Done!')

    def _validateDF(self, df, name):
        """Print whether `df` contains the input column and all label columns."""
        inPresent = self.inputCol in df.columns
        labelPresent = set(self.labelCols).issubset(df.columns)
        if inPresent and labelPresent:
            print('[INFO]: Dataframe {} Validation.. Success!'.format(name))
        else:
            print('[ERROR]: Dataframe {} Validation.. Failure!'.format(name))
            print('[---->]: Label Validation- {} : Input Validation - {}'.format(
                'PASS' if labelPresent else 'FAIL', 'PASS' if inPresent else 'FAIL',))

    def _get_sample_full_path(self, fileName):
        """Join `fileName` onto `self.dataFolder`."""
        return os.path.join(self.dataFolder, fileName)

    def _createFullInputPath(self, df):
        """Return a copy of `df` whose input column holds full sample paths.

        Working on a copy keeps a DataFrame supplied by the caller intact, so
        reusing it for another pipeline does not prefix the folder twice.
        """
        resolved = df.copy()
        resolved[self.inputCol] = resolved[self.inputCol].apply(self._get_sample_full_path)
        return resolved

    def _dfConvertFilePath(self):
        """Replace `self.dataFrame` with its full-path version."""
        self.dataFrame = self._createFullInputPath(self.dataFrame)
def safe_label_to_labelmap_converter(labels):
    """Build a ``{label: index}`` map from an ordered collection of labels.

    # Arguments
        labels: Iterable of hashable labels, or a single label given as a
            string. A string is treated as one label rather than being
            split into characters.

    # Returns
        dict mapping each label to its position. For a repeated label the
        last position wins.
    """
    if isinstance(labels, str):
        labels = [labels]
    return {label: index for index, label in enumerate(labels)}
def __init__(self, folder, trainDF=None, testDF=None, x_col = "name", y_col = "labels", targetDim=(224,224), normalize=False , batch_size = 16, augmentation = True,
            color_mode="rgb", class_mode="sparse", shuffle=True, seed=17):
    """Create the train and test generators from DataFrames or CSV files.

    `trainDF` and `testDF` are read with `pd.read_csv` when given as
    strings and used as they are otherwise. Both generators come from
    `flow_from_dataframe`; only the train generator is shuffled and seeded.
    The remaining arguments are described in the class docstring.
    """
    if isinstance(trainDF, str):
        self.trainDF = pd.read_csv(trainDF)
        print("[INFO]: Succesfully read Train Dataframe from CSV" )
    else:
        self.trainDF = trainDF
        print("[INFO]: Succesfully read Train Dataframe" )

    if isinstance(testDF, str):
        self.testDF = pd.read_csv(testDF)
        print("[INFO]: Succesfully read Test Dataframe from CSV" )
    else:
        self.testDF = testDF
        print("[INFO]: Succesfully read Test Dataframe" )

    self.folder = folder
    self.x_col = x_col
    self.y_col = y_col
    self.targetDim = targetDim
    self.normalize = normalize
    self.batch_size = batch_size

    self.color_mode = color_mode
    self.class_mode = class_mode
    self.seed = seed
    self.shuffle = shuffle
    self.trainGen = myDict()
    self.testGen = myDict()
    self.class_weights = myDict()

    if isinstance(augmentation, AUGMENTATION):
        # Caller-supplied augmentation: test data always uses a plain generator.
        self.augmentation = augmentation
        self.augmentation.testAug = tf.keras.preprocessing.image.ImageDataGenerator()
    else:
        if augmentation==True or augmentation=='True' or augmentation=='Default':
            self.augmentation = AUGMENTATION()
        else:
            # Augmentation disabled: plain generators for both splits.
            self.augmentation = myDict()
            self.augmentation.trainAug = tf.keras.preprocessing.image.ImageDataGenerator()
            self.augmentation.testAug = tf.keras.preprocessing.image.ImageDataGenerator()

    print("[INFO]: Gathering images for generator")
    self.trainGen.generator = self.augmentation.trainAug.flow_from_dataframe(
        dataframe = self.trainDF,
        directory=self.folder,
        x_col = self.x_col,
        y_col = self.y_col,
        target_size=targetDim,
        batch_size=self.batch_size,
        color_mode = self.color_mode,
        class_mode = self.class_mode,
        seed = self.seed,
        shuffle = self.shuffle,
        validate_filenames = False
        )

    self.testGen.generator = self.augmentation.testAug.flow_from_dataframe(
        dataframe = self.testDF,
        directory=self.folder,
        x_col = self.x_col,
        y_col = self.y_col,
        target_size=targetDim,
        batch_size=self.batch_size,
        color_mode = self.color_mode,
        class_mode = self.class_mode,
        validate_filenames = False,
        shuffle = False
        )
    self.trainGen.STEP_SIZE = np.ceil(self.trainGen.generator.n/self.trainGen.generator.batch_size)
    self.testGen.STEP_SIZE = np.ceil(self.testGen.generator.n/self.testGen.generator.batch_size)

    if class_mode in ['binary', 'sparse', 'categorical']:
        self.labelMap = self.trainGen.generator.class_indices
        # Convert from self.labelMap directly: the per-generator `labelMap`
        # attributes are only assigned further down, so reading them here
        # would use a value that has not been set yet.
        self.trainGen.labelNames = safe_labelmap_converter(self.labelMap)
        self.testGen.labelNames = safe_labelmap_converter(self.labelMap)
    elif class_mode in ['raw', 'multi_output']:
        self.trainGen.labelNames = self.y_col
        self.testGen.labelNames = self.y_col
        self.labelMap = safe_label_to_labelmap_converter(self.y_col)

    self.trainGen.labelMap = self.labelMap
    self.testGen.labelMap = self.labelMap
    if len(np.asarray(self.trainGen.generator.labels).shape)==1:
        # 1-D labels are class indices; expand them to one-hot rows.
        print("[INFO]: Converting labes to one_hot_labels")
        self.trainGen.generator.one_hot_labels = to_categorical(self.trainGen.generator.labels)
        self.testGen.generator.one_hot_labels = to_categorical(self.testGen.generator.labels)
    else:
        self.trainGen.generator.one_hot_labels = self.trainGen.generator.labels
        self.testGen.generator.one_hot_labels = self.testGen.generator.labels
@@ -193,7 +193,11 @@ def get_ppv(expected, preds, threshold=0.5): PPV = 0.0 TP = true_positives(expected, preds, threshold) FP = false_positives(expected, preds, threshold) - PPV = TP/(TP+FP) + if TP+FP==0: + print('[WARN]: NO True and False Positives Found for', className) + PPV = 0 + else: + PPV = TP/(TP+FP) return PPV def get_npv(expected, preds, threshold=0.5): @@ -258,7 +262,7 @@ def get_curve(gt, pred, target_names, curve='roc',returnPlot = False, showPlot= ax.set_xlim([0.0, 1.0]) plt.step(recall, precision, where='post', label=label) - plt.legend(loc='best') + plt.legend(bbox_to_anchor=(1.2, 1.0, 0.3, 0.2), loc='best') #print('showPlot ',showPlot) if showPlot==True: plt.show() @@ -269,17 +273,13 @@ def get_curve(gt, pred, target_names, curve='roc',returnPlot = False, showPlot= return ax - -def get_roc_curve(labels, predicted_vals, groundTruth= None, generator=None , returnPlot = False, showPlot= True, axes=None, **kwargs): +def get_roc_curve(labels, predicted_vals, groundTruth= None, generator=None , returnPlot = True, showPlot= False, axes=None, **kwargs): + #print('get_roc_curve Args: returnPlot - {} : showPlot - {}'.format(returnPlot,showPlot) ) auc_roc_vals = [] + meanAUROC = AUROC_mean(groundTruth,predicted_vals) for i in range(len(labels)): try: - if generator is not None and groundTruth is None: - gt = generator.labels[:, i] - elif groundTruth is not None and generator is None: - gt = groundTruth[:, i] - else: - print('Wrong Configuration: Only groundTruth or generator can be set- Not BOTH') + gt = groundTruth[:, i] pred = predicted_vals[:, i] auc_roc = roc_auc_score(gt, pred) auc_roc_vals.append(auc_roc) @@ -289,33 +289,34 @@ def get_roc_curve(labels, predicted_vals, groundTruth= None, generator=None , re ax = plt ax.xlabel('False positive rate') ax.ylabel('True positive rate') - ax.title('ROC Curve',fontsize = 18, color='r') + ax.title('ROC Curve [Mean AUROC-{:.4f}]'.format(meanAUROC),fontsize = 18, color='r') else: ax = axes ax.set_xlabel('False 
positive rate') ax.set_ylabel('True positive rate') - ax.set_title('ROC Curve',fontsize = 18, color='r') + ax.set_title('ROC Curve [Mean AUROC-{:.4f}]'.format(meanAUROC),fontsize = 18, color='r') ax.plot([0, 1], [0, 1], 'k--') ax.plot(fpr_rf, tpr_rf, label=labels[i] + " (" + str(round(auc_roc, 3)) + ")") - - - ax.legend(loc='best') + ax.legend(bbox_to_anchor=(1.05, 1.0, 0.3, 0.2), loc='best') except Exception as err: print("[ERROR]: in generating ROC curve for",labels[i], '\n',err) - #print('showPlot ROC Curve',showPlot) + if showPlot==True: plt.show() if returnPlot==True: + #print('ROC Return 1') return auc_roc_vals, fig else: + #print('ROC Return 2') return auc_roc_vals else: if returnPlot==True: + #print('ROC Return 3') return auc_roc_vals, ax else: - return auc_roc_vals - + #print('ROC Return 4') + return auc_roc_vals, ax def bootstrap_auc(y, pred, classes, bootstraps = 100, fold_size = 1000): statistics = np.zeros((len(classes), bootstraps)) @@ -359,6 +360,17 @@ def get_false_pos(y, pred, th=0.5): pred_t = (pred > th) return np.sum((pred_t == True) & (y == 0)) +def _gen_proper_threshold_settings(thresholds, class_labels): + if isinstance(thresholds, float): + thresholds = [thresholds] * len(class_labels) + elif isinstance(thresholds, list): + if len(thresholds) == 0: + thresholds = [.5] * len(class_labels) + if len(thresholds) == 1 and len(thresholds) != len(class_labels): + thresholds = thresholds * len(class_labels) + thresholds = [float(x) for x in thresholds] + return thresholds + def get_performance_metrics(y, pred, class_labels, tp=get_true_pos, tn=get_true_neg, fp=get_false_pos, fn=get_false_neg, @@ -366,8 +378,7 @@ def get_performance_metrics(y, pred, class_labels, tp=get_true_pos, sens=None, ppv=None, npv=None, auc=None, f1=None, thresholds=[]): - if len(thresholds) != len(class_labels): - thresholds = [.5] * len(class_labels) + thresholds = _gen_proper_threshold_settings(thresholds, class_labels) columns = ["", "TP", "TN", "FP", "FN", "Accuracy", 
"Prevalence", "Sensitivity", @@ -376,13 +387,13 @@ def get_performance_metrics(y, pred, class_labels, tp=get_true_pos, for i in range(len(class_labels)): df.loc[i] = [""] + [0] * (len(columns) - 1) df.loc[i][0] = class_labels[i] - df.loc[i][1] = round(tp(y[:, i], pred[:, i]), + df.loc[i][1] = round(tp(y[:, i], pred[:, i], thresholds[i]), 3) if tp != None else "Not Defined" - df.loc[i][2] = round(tn(y[:, i], pred[:, i]), + df.loc[i][2] = round(tn(y[:, i], pred[:, i], thresholds[i]), 3) if tn != None else "Not Defined" - df.loc[i][3] = round(fp(y[:, i], pred[:, i]), + df.loc[i][3] = round(fp(y[:, i], pred[:, i], thresholds[i]), 3) if fp != None else "Not Defined" - df.loc[i][4] = round(fn(y[:, i], pred[:, i]), + df.loc[i][4] = round(fn(y[:, i], pred[:, i], thresholds[i]), 3) if fn != None else "Not Defined" df.loc[i][5] = round(acc(y[:, i], pred[:, i], thresholds[i]), 3) if acc != None else "Not Defined" @@ -392,11 +403,11 @@ def get_performance_metrics(y, pred, class_labels, tp=get_true_pos, 3) if sens != None else "Not Defined" df.loc[i][8] = round(spec(y[:, i], pred[:, i], thresholds[i]), 3) if spec != None else "Not Defined" - df.loc[i][9] = round(ppv(y[:, i], pred[:, i], thresholds[i]), + df.loc[i][9] = round(ppv(y[:, i], pred[:, i], thresholds[i], class_labels[i]), 3) if ppv != None else "Not Defined" df.loc[i][10] = round(npv(y[:, i], pred[:, i], thresholds[i]), 3) if npv != None else "Not Defined" - df.loc[i][11] = round(auc(y[:, i], pred[:, i]), + df.loc[i][11] = round(auc(y[:, i], pred[:, i] > thresholds[i]), 3) if auc != None else "Not Defined" df.loc[i][12] = round(f1(y[:, i], pred[:, i] > thresholds[i]), 3) if f1 != None else "Not Defined" @@ -413,7 +424,7 @@ def model_performance_metrics(y, pred, class_labels, tp=get_true_pos, tn=get_true_neg, fp=get_false_pos, fn=get_false_neg,acc=get_accuracy, prevalence=get_prevalence, sens=get_sensitivity, spec=get_specificity, ppv=get_ppv, npv=get_npv, auc=roc_auc_score, f1=f1_score, - thresholds=[]) + 
def AUROC_each_class(gt,preds, totalClasses, CLASS_NAMES, mean = True, prints=False):
    """Return formatted per-class AUROC strings, optionally with the mean.

    # Arguments
        gt: Ground-truth array indexed as `gt[:, i]` per class.
        preds: Prediction array indexed as `preds[:, i]` per class.
        totalClasses: Number of class columns to score.
        CLASS_NAMES: Class names, indexed by column position.
        mean: Append a 'MEAN AUROC' entry computed by `AUROC_mean`.
        prints: Print the entries, one per line.

    # Returns
        list of strings formatted as '<name> : <score to 4 decimals>'.
    """
    # Score every class column first, then format the results.
    class_scores = [roc_auc_score(gt[:, col], preds[:, col]) for col in range(totalClasses)]
    report_lines = [
        '{} : {:.4f}'.format(CLASS_NAMES[col], score)
        for col, score in enumerate(class_scores)
    ]
    if mean:
        mean_score = AUROC_mean(gt, preds)
        report_lines.append('{} : {:.4f}'.format('MEAN AUROC', mean_score))
    if prints:
        print('\n'.join(report_lines))
    return report_lines
@@ -476,38 +503,60 @@ def generate_evaluation_report(CLASS_NAMES, predictions, groundTruth=None, gene """ OUTPUT_CLASSES = len(CLASS_NAMES) if groundTruth is not None and generator is None: - gt_one_hot_vec = np.identity(OUTPUT_CLASSES)[groundTruth[:,0]] - gt = groundTruth[:,0] - elif generator is not None: - gt_one_hot_vec = np.identity(OUTPUT_CLASSES)[generator.labels] - gt = generator.labels + if len(groundTruth.shape)==1 and groundTruth.shape[-1] != OUTPUT_CLASSES: + gt_one_hot_vec = np.identity(OUTPUT_CLASSES)[groundTruth[:,0]] + gt = groundTruth#[:,0] + else: + gt_one_hot_vec = groundTruth + gt = groundTruth + + elif generator is not None and groundTruth is None: + if len(generator.labels.shape)==1 and generator.labels.shape[-1] != OUTPUT_CLASSES: + gt_one_hot_vec = np.identity(OUTPUT_CLASSES)[generator.labels] + gt = generator.labels + else: + gt_one_hot_vec = generator.labels + gt = generator.labels + multiLabelClass = False + if len(gt_one_hot_vec)>1: + v= np.count_nonzero(gt_one_hot_vec, axis=1) + multiLabelClass = (v>1).any() + + if multiLabelClass: + try: + multilabel_confusion_matrix(gt_one_hot_vec[0:2], predictions[0:2]) + except: + thresholds = _gen_proper_threshold_settings(thresholds, CLASS_NAMES) + print('[WARN]: Raw Prediction Values Passed. 
Binarizing predictions with threshold={} to calculate confusion matrix.'.format(thresholds[0])) + predictions = (predictions > thresholds[0]).astype('int') + nrows=kwargs['nrows'] if 'nrows' in kwargs else 2 + ncols=kwargs['ncols'] if 'ncols' in kwargs else 1 + pad = kwargs['pad'] if 'pad' in kwargs else 10 + hspace = kwargs['hspace'] if 'hspace' in kwargs else 0.3 + hspace2 = kwargs['hspace2'] if 'hspace2' in kwargs else 0.3 + figSize = kwargs['figsize'] if 'figsize' in kwargs else (10,10) + modelName = kwargs['modelName'] if 'modelName' in kwargs else 'Model Name' + pdfName = kwargs['pdfName'] if 'pdfName' in kwargs and kwargs['pdfName'] is not None else modelName + if not showPlot: - nrows=kwargs['nrows'] if 'nrows' in kwargs else 2 - ncols=kwargs['ncols'] if 'ncols' in kwargs else 1 - pad = kwargs['pad'] if 'pad' in kwargs else 10 - hspace = kwargs['hspace'] if 'hspace' in kwargs else 0.3 - hspace2 = kwargs['hspace2'] if 'hspace2' in kwargs else 0.3 - figSize = kwargs['figsize'] if 'figsize' in kwargs else (10,10) - modelName = kwargs['modelName'] if 'modelName' in kwargs else 'Model Name' - pdfName = kwargs['pdfName'] if 'pdfName' in kwargs and kwargs['pdfName'] is not None else modelName fig = plt.figure(figsize=figSize) axs = fig.subplots(nrows=nrows, ncols=ncols) - fig.tight_layout(pad=pad) plt.subplots_adjust(hspace=hspace) + #plt.tight_layout() else: axs = [None for x in range(0,nrows*ncols)] auc_roc_vals, roc_plt = get_roc_curve(CLASS_NAMES,predictions,groundTruth=gt_one_hot_vec, generator=None, - returnPlot = True, showPlot= showPlot, axes=axs[0]) + returnPlot = returnPlot, showPlot= showPlot, axes=axs[0]) prefMetrics = get_performance_metrics(gt_one_hot_vec, predictions, CLASS_NAMES, acc=get_accuracy, prevalence=get_prevalence, - sens=get_sensitivity, spec=get_specificity, ppv=get_ppv, npv=get_npv, auc=roc_auc_score, f1=f1_score) - + sens=get_sensitivity, spec=get_specificity, ppv=get_ppv, npv=get_npv, auc=roc_auc_score, f1=f1_score, 
thresholds=thresholds) + #meanAUROC = AUROC_mean(gt_one_hot_vec,predictions) statistics = bootstrap_auc(gt_one_hot_vec, predictions, CLASS_NAMES) confInterval = confidence_intervals(CLASS_NAMES, statistics) prc_plt = get_curve(gt_one_hot_vec, predictions, CLASS_NAMES, curve='precision_recall_curve', returnPlot = False, showPlot= showPlot, axes=axs[1]) - + fig.tight_layout(pad=pad) dfSpec = prefMetrics[['Sensitivity','Specificity', 'PPV', 'NPV', 'AUC', 'F1']]#.astype(float) dfAcc= prefMetrics[['TP', 'TN', 'FP', 'FN', 'Accuracy', 'Prevalence']] @@ -519,21 +568,31 @@ def generate_evaluation_report(CLASS_NAMES, predictions, groundTruth=None, gene axs2[1] = render_df_as_table(dfAcc, ax= axs2[1], title = 'Validation Accuracy Details (Th={})'.format(prefMetrics['Threshold'][0]),header_columns=0, col_width=2.2, header_color='#DE2E81', resetIndex=True) axs2[2] = render_df_as_table(confInterval, ax= axs2[2],title = 'Confidence Interval',header_columns=0, col_width=4, header_color='#2BC4C5', resetIndex=True) - fig3 = plt.figure(figsize=(10,10)) - axs3 = fig3.subplots(nrows=1, ncols=1) - con_mat_norm_df, con_mat_df,Accuracy,cohenKappaScore = _CM_calculate(np.argmax(predictions, axis=-1),gt,CLASS_NAMES) - axs3 = _Plot_Heatmap_from_DF(con_mat_norm_df, title="Confusion Matrix Normalized", ax = axs3, Accuracy=Accuracy, cohenKappaScore=cohenKappaScore) - #axs3[1] = _Plot_Heatmap_from_DF(con_mat_df, title="Confusion Matrix", ax = axs3[1], Accuracy=Accuracy, cohenKappaScore=cohenKappaScore) - #fig3.tight_layout() + + if not multiLabelClass: + fig3 = plt.figure(figsize=(10,10)) + axs3 = fig3.subplots(nrows=1, ncols=1) + con_mat_norm_df, con_mat_df,Accuracy,cohenKappaScore = _CM_calculate(np.argmax(predictions, axis=-1),gt,CLASS_NAMES) + axs3 = _Plot_Heatmap_from_DF(con_mat_norm_df, title="Confusion Matrix Normalized", ax = axs3, Accuracy=Accuracy, cohenKappaScore=cohenKappaScore) + else: + cm_cols = 4 if OUTPUT_CLASSES>4 else OUTPUT_CLASSES + cm_rows = 2 + numPages = 
int(np.ceil(OUTPUT_CLASSES/(cm_cols*cm_rows))) + fig3 = plot_multilabel_confusion_matrix(gt_one_hot_vec,predictions,CLASS_NAMES,numPages=numPages,rows_per_page=cm_rows,ncols=cm_cols) + allMetrics = [prefMetrics, confInterval, auc_roc_vals] if showPlot: plots =[roc_plt, prc_plt,axs2[0], axs2[1], axs2[2], axs3[0],axs3[1] ] else: - plots =[fig2,fig, fig3] + fig3 = fig3 if isinstance(fig3, list) else [fig3] + fig2 = fig2 if isinstance(fig2, list) else [fig2] + fig = fig if isinstance(fig, list) else [fig] + plots =np.array([np.array(fig2),np.array(fig), np.array(fig3)]) + plots = np.concatenate(plots).ravel().tolist() + if printStat==True: - print(dfSpec) - print(dfAcc) - print(confInterval) + AUROC_each_class(gt_one_hot_vec,predictions,OUTPUT_CLASSES, CLASS_NAMES, mean=True,prints=True) + with PdfPages(pdfName+'_report.pdf') as pdf: for x in plots: @@ -545,12 +604,12 @@ def generate_evaluation_report(CLASS_NAMES, predictions, groundTruth=None, gene def render_df_as_table(data, title = 'Table', col_width=3.0, row_height=0.625, font_size=18, header_color='#655EE5', row_colors=['#f1f1f2', 'w'], edge_color='w', - bbox=[0, 0, 1, 1], header_columns=0, resetIndex=False, + bbox=[0, 0, 1, 1], header_columns=0, resetIndex=False, rename_cols = {'index': 'CLASSES'}, ax=None, **kwargs): if resetIndex: data = data.reset_index() try: - data =data.rename(columns={'index': 'CLASSES'}) + data =data.rename(columns=rename_cols) except: v = None if ax is None: @@ -603,8 +662,8 @@ def _CM_calculate(predictions,ground_truth,classNames): return con_mat_norm_df, con_mat_df,Accuracy,cohenKappaScore def _Plot_Heatmap_from_DF(pdDf, title="Confusion Matrix", ax = None, Accuracy=None, cohenKappaScore=None, printTitle=False): - subTitleStr= '➤ Model Accuracy:{:.2f}% '.format(Accuracy) if Accuracy is not None else "" - subTitleStr+='➤ Cohen Kappa Score {:.3f}'.format(cohenKappaScore) if cohenKappaScore is not None else "" + subTitleStr= '\u25A0 Model Accuracy:{:.2f}% '.format(Accuracy) if Accuracy 
is not None else "" + subTitleStr+='\u25A0 Cohen Kappa Score {:.3f}'.format(cohenKappaScore) if cohenKappaScore is not None else "" if ax is None: fig,ax = plt.figure(figsize=(10, 10)) sns.heatmap(pdDf, annot=True, cmap=plt.get_cmap('PuRd') , ax =ax) @@ -619,15 +678,12 @@ def _Plot_Heatmap_from_DF(pdDf, title="Confusion Matrix", ax = None, Accuracy=No return ax -def plot_confusion_matrix(model=None, test_data=None, test_labels =None, labelNames=None, title='Confusion Matrix', predictions=None, showPlot=True, returnPlot=False): - if predictions is None: - test_predictions = np.argmax(model.predict(test_data), axis=-1) - else: - test_predictions = np.argmax(predictions, axis=-1) +def plot_confusion_matrix(y_true, y_pred, labelNames=None, title='Confusion Matrix', showPlot=True, returnPlot=False): + predictions = np.argmax(y_pred, axis=-1) #print(classify_report(test_labels,test_predictions)) - cohenKappaScore = cohen_kappa_score(test_labels,test_predictions) + cohenKappaScore = cohen_kappa_score(y_true,predictions) #print('Cohen kappa Score:', cohenKappaScore) - con_mat = confusion_matrix(test_labels,test_predictions) + con_mat = confusion_matrix(y_true,predictions) con_mat_norm = np.around(con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis], decimals=2) con_mat_norm_df = pd.DataFrame(con_mat_norm, @@ -639,12 +695,92 @@ def plot_confusion_matrix(model=None, test_data=None, test_labels =None, labelNa plt.title("AI-Bharata MedicalAI\n\n", loc='center', fontsize=18,color='grey') - plt.title('{:}\n➤ Model Accuracy:{:.2f}% ➤ Cohen Kappa Score {:.3f}'.format(title,float(get_accuracy_score(test_labels, test_predictions)*100),cohenKappaScore),loc='left', fontsize=13, ) + plt.title('{:}\n\u25A0 Model Accuracy:{:.2f}% \u25A0 Cohen Kappa Score {:.3f}'.format(title,float(get_accuracy_score(y_true, predictions)*100),cohenKappaScore),loc='left', fontsize=13, ) plt.ylabel('True label',fontsize=18) plt.xlabel('Predicted label',fontsize=18) if showPlot: plt.show() if 
def plot_multilabel_confusion_matrix(y_true, y_pred, class_names, numPages= 2, rows_per_page=4, ncols=4 ):
    """Plot one 2x2 confusion matrix per class, spread over several figures.

    # Arguments
        y_true: Ground-truth labels passed to `multilabel_confusion_matrix`.
        y_pred: Predicted labels passed to `multilabel_confusion_matrix`.
        class_names: Class names, indexed like the per-class matrices.
        numPages: Number of figures to create.
        rows_per_page: Subplot rows per figure.
        ncols: Subplot columns per figure.

    # Returns
        list of matplotlib figures, one per page. Axes on a page that have
        no class to show are removed from their figure.
    """
    # Every page shows exactly as many matrices as it has axes.
    plots_per_page = rows_per_page * ncols
    mcm = multilabel_confusion_matrix(y_true, y_pred)
    eps = sys.float_info.epsilon  # keeps the ratios below finite when a count is zero
    figures = []
    for page in range(numPages):
        # squeeze=False always yields a 2-D array of axes, even for a 1x1 grid.
        fig, axes = plt.subplots(rows_per_page, ncols, figsize=(20, 10), squeeze=False)
        axes = axes.flatten()
        first_class = page * plots_per_page
        page_mcm = mcm[first_class:first_class + plots_per_page]

        # Drop the trailing axes that have no matrix on this page.
        for unused_ax in axes[len(page_mcm):]:
            fig.delaxes(unused_ax)

        for offset, conf_matrix in enumerate(page_mcm):
            # The class index is derived from the page index, which nothing
            # inside this loop reassigns.
            label_idx = first_class + offset
            tn, fp, fn, tp = conf_matrix.ravel()
            f1 = 2 * tp / (2 * tp + fp + fn + eps)
            recall = tp / (tp + fn + eps)
            precision = tp / (tp + fp + eps)
            _plot_cm_single_with_annot(
                conf_matrix,
                classes=['Absent', 'Present'],
                title=('{}'.format(class_names[label_idx])),
                subtitle = ('\u25A0 f1 - {:.2f} \u25A0 Sensitivity - {:.2f} \u25A0 PPV - {:.2f}'.format(f1,recall,precision)),
                ax=axes[offset]
            )
        fig.tight_layout(pad=2)

        figures.append(fig)
    return figures
""" def call(self, inputSize, outputSize, **kwargs): + try: + finalActivation = kwargs["finalActivation"] + print('[INFO]: {} Final Activation Set to {}'.format(self.__class__.__name__,finalActivation)) + except: + finalActivation = 'softmax' try: model = Sequential([ Conv2D(64, kernel_size=(5, 5), strides=(1, 1),activation='relu', padding = 'valid',input_shape=inputSize, name='CNN1'), @@ -79,7 +84,7 @@ def call(self, inputSize, outputSize, **kwargs): Flatten(), Dense(384, activation='relu', name='FC1'), Dense(192, activation='relu', name='FC2'), - Dense(outputSize, activation='softmax', name='FC3') + Dense(outputSize, activation=finalActivation, name='FC3') ]) except ValueError: model = Sequential([ @@ -109,6 +114,11 @@ def call(self, inputSize, outputSize, **kwargs): print('convLayers Not Passed in Network Parameters. Pass the parameter using **ai_params') print('Using Default 3 convLayers') convLayers = 3 + try: + finalActivation = kwargs["finalActivation"] + print('[INFO]: {} Final Activation Set to {}'.format(self.__class__.__name__,finalActivation)) + except: + finalActivation = 'softmax' try: model = Sequential() model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1),activation='relu', padding = 'valid',input_shape=inputSize, name='CNN1')) @@ -119,7 +129,7 @@ def call(self, inputSize, outputSize, **kwargs): model.add(Flatten()) model.add(Dense(384, activation='relu', name='FC1')) model.add(Dense(192, activation='relu', name='FC2')) - model.add(Dense(outputSize, activation='softmax', name='FC3')) + model.add(Dense(outputSize, activation=finalActivation, name='FC3')) return model except ValueError: print(20*'-') @@ -139,6 +149,12 @@ def call(self, inputSize, outputSize, **kwargs): print('convLayers Not Passed in Network Parameters. 
Pass the parameter using **ai_params') print('Using Default 3 convLayers') convLayers = 3 + try: + finalActivation = kwargs["finalActivation"] + print('[INFO]: {} Final Activation Set to {}'.format(self.__class__.__name__,finalActivation)) + except: + finalActivation = 'softmax' + try: model = Sequential() model.add(Conv2D(64, kernel_size=(3, 3), strides=(1, 1),activation='relu', padding = 'valid',input_shape=inputSize, name='CNN1')) @@ -154,7 +170,7 @@ def call(self, inputSize, outputSize, **kwargs): model.add(Dropout(rate=0.5)) model.add(Dense(192, activation='relu', name='FC3')) model.add(Dropout(rate=0.5)) - model.add(Dense(outputSize, activation='softmax', name='FC4')) + model.add(Dense(outputSize, activation=finalActivation, name='FC4')) return model except ValueError as err: print(err) @@ -217,7 +233,8 @@ class DenseNet121(NetworkInit): outputSize: Number of classes for prediction """ def call(self, inputSize, outputSize, **kwargs): - return densenet.DenseNet121_Model(img_input=inputSize,classes=outputSize) + finalActivation = kwargs["finalActivation"] if 'finalActivation' in kwargs else "softmax" + return densenet.DenseNet121_Model(img_input=inputSize,classes=outputSize,finalActivation=finalActivation) class VGG16(NetworkInit): """ diff --git a/medicalai/chief/nnets/covid_net.py b/medicalai/chief/nnets/covid_net.py index a245887..9c93977 100644 --- a/medicalai/chief/nnets/covid_net.py +++ b/medicalai/chief/nnets/covid_net.py @@ -11,10 +11,11 @@ def PEPXModel(input_tensor, filters, name): return x -def COVIDNET_Keras(img_input=(224, 224, 3), classes =4): +def COVIDNET_Keras(img_input=(224, 224, 3), classes =4, **kwargs): """This is a tensorflow 2.0 network variant for COVID-Net described in Paper "COVID-Net: A Tailored Deep Convolutional Neural Network Design for Detection of COVID-19 Cases from Chest Radiography Images" by Linda Wang et al. 
Reference: https://github.com/busyyang/COVID-19/ """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' input = Input(shape=img_input, name='input') x = Conv2D(input_shape=img_input, filters=64, kernel_size=(7, 7), activation='relu', padding='same', strides=(2, 2))(input) @@ -62,7 +63,7 @@ def COVIDNET_Keras(img_input=(224, 224, 3), classes =4): fla = Flatten()(add([y_4_1, y_4_2, y_4_3, p_4_y])) d1 = Dense(1024, activation='relu')(fla) d2 = Dense(256, activation='relu')(d1) - output = Dense(classes, activation='softmax')(d2) + output = Dense(classes, activation=finalActivation)(d2) return tf.keras.models.Model(input, output) diff --git a/medicalai/chief/nnets/densenet.py b/medicalai/chief/nnets/densenet.py index cb1c8dc..7bdd818 100644 --- a/medicalai/chief/nnets/densenet.py +++ b/medicalai/chief/nnets/densenet.py @@ -1,7 +1,7 @@ import tensorflow as tf -def DenseNet121_Model(img_input=(224,224,3),classes=3): +def DenseNet121_Model(img_input=(224,224,3),classes=3, **kwargs): """ Loaded the DenseNet121 network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def DenseNet121_Model(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.DenseNet121(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def DenseNet121_Model(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place 
the head FC model on top of the base model (this will become the actual model we will train) model = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model if __name__ == '__main__': diff --git a/medicalai/chief/nnets/inceptionResnet.py b/medicalai/chief/nnets/inceptionResnet.py index a2514e1..0fe6c3c 100644 --- a/medicalai/chief/nnets/inceptionResnet.py +++ b/medicalai/chief/nnets/inceptionResnet.py @@ -1,7 +1,7 @@ import tensorflow as tf -def InceptionResNetV2_Model(img_input=(224,224,3),classes=3): +def InceptionResNetV2_Model(img_input=(224,224,3),classes=3,**kwargs): """ Loaded the InceptionResNetV2 network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def InceptionResNetV2_Model(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.InceptionResNetV2(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def InceptionResNetV2_Model(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model 
and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model_full if __name__ == '__main__': diff --git a/medicalai/chief/nnets/inceptionv3.py b/medicalai/chief/nnets/inceptionv3.py index cc4b780..77c5d31 100644 --- a/medicalai/chief/nnets/inceptionv3.py +++ b/medicalai/chief/nnets/inceptionv3.py @@ -1,7 +1,7 @@ import tensorflow as tf -def InceptionV3(img_input=(224,224,3),classes=3): +def InceptionV3(img_input=(224,224,3),classes=3, **kwargs): """ Loaded the InceptionV3 network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def InceptionV3(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.InceptionV3(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def InceptionV3(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model_full if __name__ == '__main__': diff --git a/medicalai/chief/nnets/mobilenet.py 
b/medicalai/chief/nnets/mobilenet.py index 9312825..9503113 100644 --- a/medicalai/chief/nnets/mobilenet.py +++ b/medicalai/chief/nnets/mobilenet.py @@ -1,7 +1,7 @@ import tensorflow as tf -def MobileNet(img_input=(224,224,3),classes=3): +def MobileNet(img_input=(224,224,3),classes=3, **kwargs): """ Loaded the MobileNet network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def MobileNet(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.MobileNet(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def MobileNet(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model_full if __name__ == '__main__': diff --git a/medicalai/chief/nnets/mobilenetv2.py b/medicalai/chief/nnets/mobilenetv2.py index 49e2929..fa53060 100644 --- a/medicalai/chief/nnets/mobilenetv2.py +++ b/medicalai/chief/nnets/mobilenetv2.py @@ -1,7 +1,7 @@ import tensorflow as tf -def MobileNetV2(img_input=(224,224,3),classes=3): +def 
MobileNetV2(img_input=(224,224,3),classes=3,**kwargs): """ Loaded the MobileNetV2 network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def MobileNetV2(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.MobileNetV2(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def MobileNetV2(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model_full if __name__ == '__main__': diff --git a/medicalai/chief/nnets/resnet.py b/medicalai/chief/nnets/resnet.py index 242af3a..9013939 100644 --- a/medicalai/chief/nnets/resnet.py +++ b/medicalai/chief/nnets/resnet.py @@ -13,6 +13,7 @@ def resnet(img_input=(224,224,3),classes=3,name = 'ResNet50', **kwargs): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' tfAPP = tf.keras.applications resnetMethods = [x for x in dir(tfAPP) if 'ResNet' in x and not 'Inception' in x] appsLink = 'tf.keras.applications.' 
@@ -32,7 +33,7 @@ def resnet(img_input=(224,224,3),classes=3,name = 'ResNet50', **kwargs): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process @@ -41,5 +42,5 @@ def resnet(img_input=(224,224,3),classes=3,name = 'ResNet50', **kwargs): return model_full if __name__ == '__main__': - model = MobileNet() + model = resnet() model.summary() diff --git a/medicalai/chief/nnets/vgg16.py b/medicalai/chief/nnets/vgg16.py index de21025..26a5778 100644 --- a/medicalai/chief/nnets/vgg16.py +++ b/medicalai/chief/nnets/vgg16.py @@ -1,7 +1,7 @@ import tensorflow as tf -def VGG16_Model(img_input=(224,224,3),classes=3): +def VGG16_Model(img_input=(224,224,3),classes=3, **kwargs): """ Loaded the VGG16 network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def VGG16_Model(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.VGG16(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def VGG16_Model(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = 
tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model_full if __name__ == '__main__': diff --git a/medicalai/chief/nnets/xception.py b/medicalai/chief/nnets/xception.py index f582675..ea42284 100644 --- a/medicalai/chief/nnets/xception.py +++ b/medicalai/chief/nnets/xception.py @@ -1,7 +1,7 @@ import tensorflow as tf -def Xception(img_input=(224,224,3),classes=3): +def Xception(img_input=(224,224,3),classes=3, **kwargs): """ Loaded the Xception network, ensuring the head FC layer sets are left off @@ -13,6 +13,8 @@ def Xception(img_input=(224,224,3),classes=3): Returns : model """ + finalActivation=kwargs['finalActivation'] if 'finalActivation' in kwargs else 'softmax' + trainAllLayers=kwargs['trainAllLayers'] if 'trainAllLayers' in kwargs else False baseModel = tf.keras.applications.Xception(weights="imagenet", include_top=False, input_tensor=tf.keras.layers.Input(shape=img_input)) # construct the head of the model that will be placed on top of the the base model @@ -21,12 +23,12 @@ def Xception(img_input=(224,224,3),classes=3): output = tf.keras.layers.Flatten(name="flatten")(output) output = tf.keras.layers.Dense(512, activation="relu")(output) output = tf.keras.layers.Dropout(0.25)(output) - output = tf.keras.layers.Dense(classes, activation="softmax")(output) + output = tf.keras.layers.Dense(classes, activation=finalActivation)(output) # place the head FC model on top of the base model (this will become the actual model we will train) 
model_full = tf.keras.Model(inputs=baseModel.input, outputs=output) # loop over all layers in the base model and freeze them so they will not be updated during the first training process for layer in baseModel.layers: - layer.trainable = False + layer.trainable = trainAllLayers return model_full if __name__ == '__main__': diff --git a/medicalai/chief/uFuncs.py b/medicalai/chief/uFuncs.py index 2ceb159..db50a39 100644 --- a/medicalai/chief/uFuncs.py +++ b/medicalai/chief/uFuncs.py @@ -11,4 +11,28 @@ def ntimes(*args, **kw): print('%r %2.2f ms' % \ (func.__name__, (end - start) * 1000)) return value - return ntimes \ No newline at end of file + return ntimes + + + +from tqdm import tqdm +def calculate_ips(ds, steps=100, batch_size=32, verbose=False): + start = time.time() + it = iter(ds) + if verbose: + for i in tqdm(range(steps)): + batch = next(it) + if i%10 == 0: + print('.',end='') + else: + for i in range(steps): + batch = next(it) + if i%10 == 0: + print('.',end='') + print() + end = time.time() + + duration = end-start + IPS = batch_size*steps/duration + print("{} batches: {:.2f}s {:0.3f} Images/s".format(steps, duration, IPS)) + return IPS, duration \ No newline at end of file diff --git a/setup.py b/setup.py index 25fc06e..de50feb 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,9 @@ author=about['__author__'], author_email='contact@aibharata.com', license=about['__license__'], - install_requires=['pandas','tensorflow','tf_explain','numpy', 'pillow','tqdm', 'matplotlib', 'plotly', 'pandas', 'seaborn', 'sklearn', 'pydicom'], + install_requires=['pandas','tensorflow', + 'tf_explain','numpy', 'pillow','tqdm', 'cmapy', 'albumentations', + 'matplotlib', 'plotly', 'pandas', 'seaborn', 'sklearn', 'pydicom'], packages=find_packages(), include_package_data=True, package_data={