Major Changes

Added New Dataprocessors
aibharata · Jul 18, 2020 · 686e172 · 686e172
1 parent 3cfd2c1
commit 686e172
Show file tree

Hide file tree

Showing 26 changed files with 1,217 additions and 149 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,3 +1,2 @@
 {
-    "python.pythonPath": "C:\\Python35\\python3.exe"
 }
diff --git a/medicalai/__about__.py b/medicalai/__about__.py
@@ -14,5 +14,5 @@
 
 __project__ = "medicalai"
 __author__ = "Vinayaka Jyothi - For AiBharata"
-__version__ = "1.1.59"
+__version__ = "1.2.2-rc"
 __license__ = "Apache"
diff --git a/medicalai/chief/__init__.py b/medicalai/chief/__init__.py
@@ -22,4 +22,6 @@
 from . import nnets
 from . import model_metrics
 from . import xai
-from . import dataset_analysis as dataAnalyzer
+from . import dataset_analysis as dataAnalyzer
+from . import callbacks as callbacks
+from . import dataloaders as dataloader
diff --git a/medicalai/chief/callbacks/__init__.py b/medicalai/chief/callbacks/__init__.py
@@ -0,0 +1,17 @@
+#    Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd.
+
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+
+#        http://www.apache.org/licenses/LICENSE-2.0
+
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+from __future__ import absolute_import
+
+from .custom_callbacks import * 
diff --git a/medicalai/chief/callbacks/custom_callbacks.py b/medicalai/chief/callbacks/custom_callbacks.py
@@ -0,0 +1,109 @@
+from __future__ import absolute_import
+
+import json
+import os
+import shutil
+
+import numpy as np
+from sklearn.metrics import roc_auc_score
+from tensorflow.keras import backend as kb
+from tensorflow.keras.callbacks import Callback
+
+class AUROC_Callback(Callback):
+    def __init__(self, generator, workers=1):
+        super().__init__()
+        self.generator = generator
+        self.workers = workers
+
+    def on_epoch_end(self, epoch, logs=None):
+        y_pred = self.model.predict(self.generator, workers=self.workers)
+        y_true= self.generator.labels
+        meanAUROC = roc_auc_score(y_true,y_pred)
+        print(' - mAUROC:', meanAUROC)
+
+class MultipleClassAUROC(Callback):
+    '''
+    Sample Usage: 
+    auroc = MultipleClassAUROC(
+            sequence=validation_sequence,
+            class_names=class_names,
+            weights_path=output_weights_path,
+            stats=training_stats,
+            workers=generator_workers,
+        ) 
+    '''
+    def __init__(self, sequence, class_names, weights_path, stats=None, workers=1):
+        super(Callback, self).__init__()
+        self.sequence = sequence
+        self.workers = workers
+        self.class_names = class_names
+        self.weights_path = weights_path
+        self.best_weights_path = os.path.join(
+            os.path.split(weights_path)[0],
+            "best_{}".format(os.path.split(weights_path)[1]),
+        )
+        self.best_auroc_log_path = os.path.join(
+            os.path.split(weights_path)[0],
+            "best_auroc.log",
+        )
+        self.stats_output_path = os.path.join(
+            os.path.split(weights_path)[0],
+            ".training_stats.json"
+        )
+
+        # for resuming previous training
+        if stats:
+            self.stats = stats
+        else:
+            self.stats = {"best_mean_auroc": 0}
+
+        # aurocs log
+        self.aurocs = {}
+        for c in self.class_names:
+            self.aurocs[c] = []
+
+    def on_epoch_end(self, epoch, logs={}):
+        """
+        Calculate the average AUROC and save the best model weights according
+        to this metric.
+
+        """
+        print("\n*********************************")
+        self.stats["lr"] = float(kb.eval(self.model.optimizer.lr))
+        print("current learning rate: {}".format(self.stats['lr']))
+
+        """
+        y_hat shape: (#samples, len(class_names))
+        y: [(#samples, 1), (#samples, 1) ... (#samples, 1)]
+        """
+        y_hat = self.model.predict(self.sequence, workers=self.workers)
+        y = self.sequence.get_y_true()
+
+        print("*** epoch#{} dev auroc ***".format(epoch + 1))
+        current_auroc = []
+        for i in range(len(self.class_names)):
+            try:
+                score = roc_auc_score(y[:, i], y_hat[:, i])
+            except ValueError:
+                score = 0
+            self.aurocs[self.class_names[i]].append(score)
+            current_auroc.append(score)
+            print("{}. {}: {}".foramt(i+1,self.class_names[i],score))
+        print("*********************************")
+
+        # customize your multiple class metrics here
+        mean_auroc = np.mean(current_auroc)
+        print("mean auroc: {}".format(mean_auroc))
+        if mean_auroc > self.stats["best_mean_auroc"]:
+            print("update best auroc from {} to {}".format(self.stats['best_mean_auroc'],mean_auroc))
+
+            # 1. copy best model
+            shutil.copy(self.weights_path, self.best_weights_path)
+
+            # 2. update log file
+            print("update log file: {}".format(self.best_auroc_log_path))
+            with open(self.best_auroc_log_path, "a") as f:
+                f.write("(epoch#{}) auroc: {}, lr: {}\n".format(epoch + 1,mean_auroc,self.stats['lr']))
+
+            # 3. write stats output, this is used for resuming the training
+            with open(self.stats_output_path, 'w') as f:
+                json.dump(self.stats, f)
+
+            print("update model file: {} -> {}".format(self.weights_path, self.best_weights_path))
+            self.stats["best_mean_auroc"] = mean_auroc
+            print("*********************************")
+        return
+
diff --git a/medicalai/chief/core.py b/medicalai/chief/core.py
@@ -35,10 +35,13 @@
 from .model_metrics import *
 from .xai import *
 from .uFuncs import *
+from albumentations import Compose
+import albumentations.augmentations.transforms as augmentations
 
 physical_devices = tf.config.list_physical_devices('GPU')
 if len(physical_devices)>1:
 	MULTI_GPU_MODE= True
+	print('[INFO]: Medicalai activated with MultiGPU Mode')
 else:
 	MULTI_GPU_MODE= False
 GPU_to_Use = 'all'	
@@ -196,7 +199,7 @@ def train(	model, x_train,
 			class_weights = None, 
 			saveBestModel = False, bestModelCond = None, 
 			validation_data = None, TRAIN_STEPS = None, TEST_STEPS = None, 
-			verbose=None, y_train=None, 
+			verbose=None, y_train=None, workers = 1 
 		  ):
 	if callbacks is not None:
 		if ('tensorboard'in callbacks):
@@ -224,7 +227,8 @@ def train(	model, x_train,
 				epochs=epochs,
 				validation_data=validation_data,
 				callbacks=callbacks,
-				class_weight = class_weights
+				class_weight = class_weights,
+				workers =workers
 				)
 	else:
 		result = model.fit(x_train,
@@ -233,7 +237,8 @@ def train(	model, x_train,
 				validation_data=validation_data,
 				callbacks=callbacks,
 				verbose = verbose,
-				class_weight = class_weights
+				class_weight = class_weights,
+				workers = workers
 				)		
 	return result.history
 
@@ -492,7 +497,7 @@ def preprocessor_from_meta(self, metaFile=None):
 			self.labelNames = self.preProcessor.labels
 
 	#@timeit
-	def predict(self, input, verbose=0):
+	def predict(self, input, verbose=1, safe=False  , workers= 1):
 		"""
 		Peform prediction on Input. Input can be Numpy Array or Image or Data Generator (in case of Test/Validation). 
 
@@ -519,24 +524,34 @@ def predict(self, input, verbose=0):
 		# Returns
 			Numpy.Array: of Predictions. Shape of Output [Number of Inputs, Number of Output Classes in Model]
 		"""
-		if hasattr(input, 'generator') and hasattr(input, 'STEP_SIZE'):
-			return self.model.predict(input.generator, verbose=1)
-		elif hasattr(input, 'image_data_generator'):
-			return self.model.predict(input,  verbose=1)
-		elif hasattr(input, 'data') and not isinstance(input,np.ndarray):
-			return self.model.predict(input.data, verbose=verbose)
+		if hasattr(self, 'workers'):
+			workers = self.workers  
 		else:
-			if self.preProcessor is not None:
-				input = self.preProcessor.processImage(input)
-				return self.model.predict(input, verbose=verbose)
+			workers = workers
+		if safe:
+			if hasattr(input, 'generator') and hasattr(input, 'STEP_SIZE'):
+				return self.model.predict(input.generator, steps=input.STEP_SIZE, verbose=1, workers=workers)
+			elif hasattr(input, 'image_data_generator'):
+				return self.model.predict(input,  steps =(input.n/input.batch_size), verbose=1, workers=workers)
+		else:			
+			if hasattr(input, 'generator') and hasattr(input, 'STEP_SIZE'):
+				return self.model.predict(input.generator, verbose=1, workers=workers)
+			elif hasattr(input, 'image_data_generator'):
+				return self.model.predict(input,  verbose=1, workers=workers)
+			elif hasattr(input, 'data') and not isinstance(input,np.ndarray):
+				return self.model.predict(input.data, verbose=verbose, workers=workers)
 			else:
-				if self.labelNames is None:
-					if hasattr(input, 'labelNames'):
-						self.labelNames = input.labelNames if self.labelNames is None else self.labelNames
-				if isinstance(input,np.ndarray):
-					return self.model.predict(input, verbose=verbose)
+				if self.preProcessor is not None:
+					input = self.preProcessor.processImage(input)
+					return self.model.predict(input, verbose=verbose, workers=workers)
 				else:
-					return self.model.predict(input, verbose=verbose)
+					if self.labelNames is None:
+						if hasattr(input, 'labelNames'):
+							self.labelNames = input.labelNames if self.labelNames is None else self.labelNames
+					if isinstance(input,np.ndarray):
+						return self.model.predict(input, verbose=verbose, workers=workers)
+					else:
+						return self.model.predict(input, verbose=verbose, workers=workers)
 
 	#@timeit
 	def predict_pipeline(self, input):
@@ -670,7 +685,7 @@ def summary(self):
 		"""	
 		return self.model.summary()
 
-	def generate_evaluation_report(self, testSet = None, predictions = None, printStat = False,returnPlot = False, showPlot= False, pdfName =None, **kwargs):
+	def generate_evaluation_report(self, testSet = None, predictions = None, printStat = True,returnPlot = False, showPlot= False, pdfName =None, **kwargs):
 		"""
 		Generate a comprehensive PDF report with model sensitivity, specificity, accuracy, confidence intervals,
 		ROC Curve Plot, Precision Recall Curve Plot, and Confusion Matrix Plot for each class.
@@ -800,12 +815,12 @@ class TRAIN_ENGINE(INFERENCE_ENGINE):
 	"""
 	def __init__(self, modelName=None):
 		super().__init__(modelName)
-
+		
 	def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPUT_CLASSES, RETRAIN_MODEL,  EPOCHS, 
 							BATCH_SIZE=32, LEARNING_RATE=0.0001, convLayers=None,SAVE_BEST_MODEL=False, BEST_MODEL_COND=None, 
 							callbacks=None, loss = 'sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(lr=0.0001),
-							metrics = ['accuracy'], showModel = False,
-							CLASS_WEIGHTS=None, **kwargs):
+							metrics = ['accuracy'], showModel = False, workers = 1,
+							CLASS_WEIGHTS=None, **kwargs,):
 		""""
 		Main function that trains and saves a model. This automatically builds new model for given networks/AI or reload existing AI model. 
 		This function can be used to retrain existing models or create new models.
@@ -880,51 +895,68 @@ def train_and_save_model(self,AI_NAME, MODEL_SAVE_NAME, trainSet, testSet, OUTPU
 			None: On successful completion saves the trained model.
 
 		"""
+		self.workers = workers
 		self.testSet = testSet
 		self.modelName = MODEL_SAVE_NAME
 		self.test_predictions = None
+		global MULTI_GPU_MODE, GPU_to_Use
 		if hasattr(trainSet, 'data'):
 			self.labelNames = trainSet.labelNames
 			if MULTI_GPU_MODE and GPU_to_Use.lower()=='all':
 				mirrored_strategy = tf.distribute.MirroredStrategy()
 				with mirrored_strategy.scope():
 					self.model = modelManager(AI_NAME= AI_NAME, convLayers= convLayers, modelName = MODEL_SAVE_NAME, x_train = trainSet.data, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL)
 					self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics)
-				BATCH_SIZE *= mirrored_strategy.num_replicas_in_sync
+				#BATCH_SIZE *= mirrored_strategy.num_replicas_in_sync
 			else:
 					self.model = modelManager(AI_NAME= AI_NAME, convLayers= convLayers, modelName = MODEL_SAVE_NAME, x_train = trainSet.data, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL)
 					self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics)				
 			print(self.model.summary()) if showModel else None
 			print('[INFO]: BATCH_SIZE -',BATCH_SIZE)
 			self.result = train(self.model, trainSet.data, y_train= trainSet.labels, batch_size=BATCH_SIZE, epochs=EPOCHS,
 									validation_data=(testSet.data, testSet.labels), callbacks=callbacks, saveBestModel= SAVE_BEST_MODEL, 
-									bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = None, TEST_STEPS = None, 
+									bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = None, TEST_STEPS = None, workers = self.workers,
 									class_weights=CLASS_WEIGHTS)#['tensorboard'])
 			#self.model.evaluate(testSet.data, testSet.labels)
 
 			dataprc.metaSaver(trainSet.labelMap, trainSet.labelNames,  normalize=trainSet.normalize,
 							  rescale =None,
 							  network_input_dim =trainSet.network_input_dim, samplingMethodName=trainSet.samplingMethodName, outputName= MODEL_SAVE_NAME)
 		else:
-			networkDim = np.zeros((1,)+trainSet.generator.image_shape)
+			from tensorflow.python.data.ops.dataset_ops import PrefetchDataset
+			if isinstance(trainSet.generator, PrefetchDataset):
+				for f,l in trainSet.generator.take(1):
+					inpSize = f.numpy().shape
+				networkDim = np.zeros((1,)+inpSize[1:])
+				networkInputSize = inpSize[1:]
+				rescaleValue = 1./255
+			else:
+				networkDim = np.zeros((1,)+trainSet.generator.image_shape)
+				networkInputSize = trainSet.generator.image_shape
+				try:
+					rescaleValue = trainSet.generator.image_data_generator.rescale
+				except:
+					rescaleValue = 1./255
+
 			self.labelNames = dataprc.safe_labelmap_converter(trainSet.labelMap)
 			if MULTI_GPU_MODE and GPU_to_Use.lower()=='all':
 				mirrored_strategy = tf.distribute.MirroredStrategy()
 				with mirrored_strategy.scope():
-					mirrored_strategy = tf.distribute.MirroredStrategy()
 					self.model = modelManager(AI_NAME= AI_NAME, modelName = MODEL_SAVE_NAME, x_train = networkDim, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL, **kwargs)
 					self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics)
-				BATCH_SIZE *= mirrored_strategy.num_replicas_in_sync
 			else:
 					self.model = modelManager(AI_NAME= AI_NAME, modelName = MODEL_SAVE_NAME, x_train = networkDim, OUTPUT_CLASSES = OUTPUT_CLASSES, RETRAIN_MODEL= RETRAIN_MODEL, **kwargs)
 					self.model.compile(optimizer=optimizer,loss=loss,metrics=metrics)				
 			print(self.model.summary()) if showModel else None
 			print('[INFO]: BATCH_SIZE -',BATCH_SIZE)
-			self.result = train(self.model, trainSet.generator, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=testSet.generator, callbacks=callbacks, saveBestModel= SAVE_BEST_MODEL, bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = trainSet.STEP_SIZE, TEST_STEPS = testSet.STEP_SIZE, verbose=1,class_weights=CLASS_WEIGHTS)#['tensorboard'])
+			self.result = train(self.model, trainSet.generator, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=testSet.generator, 
+					callbacks=callbacks, saveBestModel= SAVE_BEST_MODEL, bestModelCond = BEST_MODEL_COND, TRAIN_STEPS = trainSet.STEP_SIZE, 
+					TEST_STEPS = testSet.STEP_SIZE, verbose=1,class_weights=CLASS_WEIGHTS, workers = self.workers
+					)
 			#self.model.evaluate(testSet.generator,steps =	testSet.STEP_SIZE)
 			dataprc.metaSaver(trainSet.labelMap, self.labelNames, normalize= None,
-							 rescale = trainSet.generator.image_data_generator.rescale,
-							 network_input_dim =trainSet.generator.image_shape, samplingMethodName=None, outputName= MODEL_SAVE_NAME)
+							 rescale = rescaleValue,
+							 network_input_dim =networkInputSize, samplingMethodName=None, outputName= MODEL_SAVE_NAME)
 
 		save_model_and_weights(self.model, outputName= MODEL_SAVE_NAME)
 

diff --git a/medicalai/chief/dataloaders/__init__.py b/medicalai/chief/dataloaders/__init__.py
@@ -0,0 +1,20 @@
+#    Copyright 2020-2022 AIBharata Emerging Technologies Pvt. Ltd.
+
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+
+#        http://www.apache.org/licenses/LICENSE-2.0
+
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+
+from __future__ import absolute_import
+
+from .data_utils import *
+from .dataset_processors import *
+from .image_sequences import *
+from .tf_image_pipelines import *