# -*- coding: utf-8 -*-
"""Train and evaluate a logistic-regression diabetes classifier with PySpark.

Exported from a Colab notebook and kept as a flat, top-to-bottom script.
Steps, in order:

1. Install Java 8 and PySpark 2.4.4 (best effort) and point ``JAVA_HOME``
   at the installed JDK.
2. Clone the diabetes dataset and load ``diabetes.csv`` into a DataFrame.
3. Report null and zero counts, then replace zeros in the measurement
   columns with the (truncated) column mean.
4. Print each column's correlation with ``Outcome``.
5. Assemble the feature vector, fit ``LogisticRegression`` on a 70/30
   split, and evaluate on the held-out part.
6. Save the fitted model and load it back.
"""

import importlib
import os
import subprocess
import sys

JAVA_HOME = '/usr/lib/jvm/java-8-openjdk-amd64'
DATASET_URL = 'https://github.com/education454/diabetes_dataset'
DATASET_DIR = 'diabetes_dataset'
MODEL_PATH = 'model'
LABEL_COLUMN = 'Outcome'
# Fixed seed so the train/test split (and therefore the metrics) is repeatable.
SPLIT_SEED = 42
# Columns whose zeros are counted and then replaced by the column mean.
ZERO_AS_MISSING_COLUMNS = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
FEATURE_COLUMNS = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]


def _run(command, **kwargs):
    """Run an external command best-effort.

    Mirrors the notebook's ``!cmd`` cells, which never aborted the run, but
    reports a failure instead of hiding it.

    Args:
        command: Argument list passed to ``subprocess.run`` (no shell).
        **kwargs: Extra keyword arguments forwarded to ``subprocess.run``.

    Returns:
        True when the command ran and exited with status 0, else False.
    """
    try:
        completed = subprocess.run(command, check=False, **kwargs)
    except OSError as err:
        print("warning: could not run {}: {}".format(command[0], err))
        return False
    if completed.returncode != 0:
        print("warning: {} exited with status {}".format(
            ' '.join(command), completed.returncode))
        return False
    return True


# --- Install dependencies ---------------------------------------------------

_run(['apt-get', 'install', 'openjdk-8-jdk-headless', '-qq'],
     stdout=subprocess.DEVNULL)
# Use the running interpreter's pip so the package lands where we import from.
_run([sys.executable, '-m', 'pip', 'install', 'pyspark==2.4.4'])
# Make a package installed a moment ago visible to the import system.
importlib.invalidate_caches()

# --- Environment Path -------------------------------------------------------

os.environ['JAVA_HOME'] = JAVA_HOME

# --- Run a SparkSession -----------------------------------------------------

# These imports must follow the install step above, hence not at file top.
from pyspark.ml.classification import LogisticRegression  # noqa: E402
from pyspark.ml.classification import LogisticRegressionModel  # noqa: E402
from pyspark.ml.evaluation import BinaryClassificationEvaluator  # noqa: E402
from pyspark.ml.feature import VectorAssembler  # noqa: E402
from pyspark.sql import SparkSession  # noqa: E402
from pyspark.sql import functions as F  # noqa: E402

spark = SparkSession.builder.appName("spark").getOrCreate()

# --- Clone Diabetes Dataset -------------------------------------------------

# ``git clone`` refuses to write into an existing directory, so skip on re-run.
if not os.path.isdir(DATASET_DIR):
    _run(['git', 'clone', DATASET_URL, DATASET_DIR])

_run(['ls', DATASET_DIR])

# Resolve against the clone location; on Colab (cwd=/content) this is
# /content/diabetes_dataset/diabetes.csv.
csv_path = os.path.join(os.path.abspath(DATASET_DIR), 'diabetes.csv')
df = spark.read.csv(csv_path, header=True, inferSchema=True)

df.show()

df.printSchema()

df.groupby(LABEL_COLUMN).count().show()

df.describe().show()

# --- Cleaning Data ----------------------------------------------------------

# Count nulls for every column in one aggregation instead of one job each.
null_counts = df.select(
    [F.count(F.when(df[name].isNull(), 1)).alias(name) for name in df.columns]
).first()
for name in df.columns:
    print(name + ":", null_counts[name])


def count_zeros(data=None, columns=None):
    """Print, for each column, the number of rows whose value equals 0.

    Args:
        data: DataFrame to inspect; defaults to the module-level ``df`` as it
            is at call time.
        columns: Column names to check; defaults to
            ``ZERO_AS_MISSING_COLUMNS``.
    """
    frame = df if data is None else data
    names = ZERO_AS_MISSING_COLUMNS if columns is None else list(columns)
    if not names:
        return
    zero_counts = frame.select(
        [F.count(F.when(frame[name] == 0, 1)).alias(name) for name in names]
    ).first()
    for name in names:
        print(name + ":", zero_counts[name])


count_zeros()

# Each mean depends only on its own, not-yet-modified column, so all of them
# can be computed up front in a single pass.
# NOTE(review): the mean is taken over all rows (zeros included) and truncated
# to an int, exactly as the notebook did -- confirm this is the intended
# imputation value.
column_means = df.agg(
    *[F.mean(name).alias(name) for name in ZERO_AS_MISSING_COLUMNS]
).first()
for name in ZERO_AS_MISSING_COLUMNS:
    mean_value = column_means[name]
    if mean_value is None:
        # No non-null values to average (e.g. empty input); nothing to fill.
        continue
    fill_value = int(mean_value)
    print("Mean value for {} is {}".format(name, fill_value))
    df = df.withColumn(
        name, F.when(df[name] == 0, fill_value).otherwise(df[name]))

df.show()

# --- Correlation ------------------------------------------------------------

for name in df.columns:
    print("correlation to outcome for {} is {}".format(
        name, df.stat.corr(LABEL_COLUMN, name)))

# --- Feature Selection ------------------------------------------------------

assembler = VectorAssembler(inputCols=FEATURE_COLUMNS, outputCol='features')
output_data = assembler.transform(df)

output_data.printSchema()

output_data.show()

# --- Build & Train Model ----------------------------------------------------

final_data = output_data.select('features', LABEL_COLUMN)

final_data.printSchema()

train, test = final_data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
models = LogisticRegression(labelCol=LABEL_COLUMN)
model = models.fit(train)

summary = model.summary

summary.predictions.describe().show()

# --- Evaluation & Test Model ------------------------------------------------

predictions = model.evaluate(test)

predictions.predictions.show(20)

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction', labelCol=LABEL_COLUMN)
# A notebook echoes a cell's last expression; a script has to print it.
metric_value = evaluator.evaluate(model.transform(test))
print("{} on the test set: {}".format(evaluator.getMetricName(), metric_value))

# Overwrite so a second run does not fail on the existing model directory.
model.write().overwrite().save(MODEL_PATH)

model = LogisticRegressionModel.load(MODEL_PATH)