Skip to content

Diabetes Prediction using Python #829

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 97 additions & 0 deletions Diabetes-prediction
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# -*- coding: utf-8 -*-

# Notebook shell commands (IPython `!` syntax, not plain Python): install a
# headless Java 8 JDK quietly and the pinned PySpark 2.4.4 release.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark==2.4.4

"""# Environment Path"""

import os
# Set JAVA_HOME before the Spark session below is created.
# NOTE(review): presumably this is where the apt package above installs the
# JVM -- confirm on the target image.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'

"""# Run a SparkSession"""

from pyspark.sql import SparkSession

# Obtain the session (application name "spark") that every later step uses
# through the `spark` variable.
spark = SparkSession.builder.appName("spark").getOrCreate()

"""# Clone Diabetes Dataset"""

! git clone https://github.com/education454/diabetes_dataset

! ls diabetes_dataset

df = spark.read.csv('/content/diabetes_dataset/diabetes.csv',header=True,inferSchema=True)

df.show()

df.printSchema()

df.groupby('Outcome').count().show()

df.describe().show()

"""# Cleaning Data"""

# Print, for every column, how many rows hold a null value.
# The loop variable is deliberately not called `col`: that global name is
# also what the wildcard import of pyspark.sql.functions binds to its `col`
# function, so reusing it makes one silently replace the other depending on
# execution order.
for column_name in df.columns:
    null_rows = df.filter(df[column_name].isNull()).count()
    print(column_name + ":", null_rows)

def count_zeros(frame=None,
                columns=('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI')):
    """Print how many rows hold the value 0 in each of the given columns.

    Args:
        frame: DataFrame to inspect. When omitted, the module-level ``df`` is
            used, which keeps the original zero-argument call working.
        columns: Names of the columns to check. Defaults to the five
            measurement columns the original implementation hard-coded.

    Returns:
        None. One ``"<column>: <count>"`` line is printed per column.
    """
    if frame is None:
        # Resolved at call time so the function sees the current global frame.
        frame = df
    for name in columns:
        print(name + ":", frame.filter(frame[name] == 0).count())

# Print the number of zero entries per measurement column of the loaded data.
count_zeros()

from pyspark.sql.functions import *
# Replace zero entries in the measurement columns with the column mean.
# Columns are addressed by name (the same five that count_zeros inspects)
# rather than by position, so the step does not depend on the CSV's column
# order.
for zero_column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    # Average only the non-zero readings: the zeros are the values being
    # replaced, so including them would drag the imputed mean towards zero.
    observed_mean = (
        df.filter(df[zero_column] != 0)
        .agg({zero_column: 'mean'})
        .first()[0]
    )
    if observed_mean is None:
        # No non-zero reading exists (or the frame is empty); there is no
        # mean to impute and int(None) would raise, so leave the column alone.
        continue
    # Truncated to an integer, exactly as the original did.
    fill_value = int(observed_mean)
    print("Mean value for {} is {}".format(zero_column, fill_value))
    df = df.withColumn(
        zero_column,
        when(df[zero_column] == 0, fill_value).otherwise(df[zero_column]),
    )

df.show()

"""# Correlation"""

# Print the correlation between 'Outcome' and every column as computed by
# df.stat.corr (the 'Outcome' column itself is included in the loop).
# The loop variable is not named `col`, because that would overwrite the
# `col` function brought in by `from pyspark.sql.functions import *`.
for column_name in df.columns:
    correlation = df.stat.corr('Outcome', column_name)
    print("correlation to outcome for {} is {}".format(column_name, correlation))

"""# Feature Selection"""

from pyspark.ml.feature import VectorAssembler
# Combine the eight predictor columns into one vector column, 'features'.
# 'Outcome' is not among the inputs.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    ],
)
output_data = assembler.transform(df)

# Schema and sample rows of the frame with the added 'features' column.
output_data.printSchema()

output_data.show()

"""# Build & Train Model"""

from pyspark.ml.classification import LogisticRegression
# Keep only the assembled feature vector and the label column.
final_data = output_data.select('features', 'Outcome')

final_data.printSchema()

# 70/30 train/test split. A fixed seed is passed so that repeated runs use the
# same seed instead of a fresh random one, which made the reported metrics
# change from run to run.
train, test = final_data.randomSplit([0.7, 0.3], seed=42)

# `models` is the (unfitted) logistic-regression estimator; `model` is the
# fitted result trained on the training split with 'Outcome' as the label.
models = LogisticRegression(labelCol='Outcome')
model = models.fit(train)

# Training summary; describe its predictions frame.
summary = model.summary

summary.predictions.describe().show()

"""# Evaluation & Test Model"""

from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Evaluate the fitted model on the held-out split. Note that `predictions` is
# the evaluation summary object; the scored rows are in its `.predictions`.
predictions = model.evaluate(test)

predictions.predictions.show(n=20)

# Binary-classification metric computed from the raw prediction column
# against the 'Outcome' label on freshly scored test rows.
evaluator = BinaryClassificationEvaluator(
    labelCol='Outcome',
    rawPredictionCol='rawPrediction',
)
evaluator.evaluate(model.transform(test))

# Persist the fitted model under the path "model". The overwrite writer is
# used because a plain save() raises when the path already exists, which
# happens on every re-run of this script.
model.write().overwrite().save("model")

from pyspark.ml.classification import LogisticRegressionModel
# Load the saved model back from the same path, replacing the in-memory one.
model = LogisticRegressionModel.load('model')