diff --git a/ML52/.ipynb_checkpoints/ML52-checkpoint.ipynb b/ML52/.ipynb_checkpoints/ML52-checkpoint.ipynb
new file mode 100644
index 0000000..28f89f0
--- /dev/null
+++ b/ML52/.ipynb_checkpoints/ML52-checkpoint.ipynb
@@ -0,0 +1,530 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " First Name | \n",
+ " Gender | \n",
+ " Start Date | \n",
+ " Last Login Time | \n",
+ " Salary | \n",
+ " Bonus % | \n",
+ " Senior Management | \n",
+ " Team | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Douglas | \n",
+ " Male | \n",
+ " 8/6/1993 | \n",
+ " 12:42 PM | \n",
+ " 97308 | \n",
+ " 6.945 | \n",
+ " True | \n",
+ " Marketing | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Thomas | \n",
+ " Male | \n",
+ " 3/31/1996 | \n",
+ " 6:53 AM | \n",
+ " 61933 | \n",
+ " 4.170 | \n",
+ " True | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Maria | \n",
+ " Female | \n",
+ " 4/23/1993 | \n",
+ " 11:17 AM | \n",
+ " 130590 | \n",
+ " 11.858 | \n",
+ " False | \n",
+ " Finance | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Jerry | \n",
+ " Male | \n",
+ " 3/4/2005 | \n",
+ " 1:00 PM | \n",
+ " 138705 | \n",
+ " 9.340 | \n",
+ " True | \n",
+ " Finance | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Larry | \n",
+ " Male | \n",
+ " 1/24/1998 | \n",
+ " 4:47 PM | \n",
+ " 101004 | \n",
+ " 1.389 | \n",
+ " True | \n",
+ " Client Services | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " First Name Gender Start Date Last Login Time Salary Bonus % \\\n",
+ "0 Douglas Male 8/6/1993 12:42 PM 97308 6.945 \n",
+ "1 Thomas Male 3/31/1996 6:53 AM 61933 4.170 \n",
+ "2 Maria Female 4/23/1993 11:17 AM 130590 11.858 \n",
+ "3 Jerry Male 3/4/2005 1:00 PM 138705 9.340 \n",
+ "4 Larry Male 1/24/1998 4:47 PM 101004 1.389 \n",
+ "\n",
+ " Senior Management Team \n",
+ "0 True Marketing \n",
+ "1 True NaN \n",
+ "2 False Finance \n",
+ "3 True Finance \n",
+ "4 True Client Services "
+ ]
+ },
+ "execution_count": 1,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Read the raw employee table and preview its first five rows.\n",
+ "df = pd.read_csv(filepath_or_buffer=\"employees.csv\")\n",
+ "df.head(n=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 1000 entries, 0 to 999\n",
+ "Data columns (total 8 columns):\n",
+ "First Name 933 non-null object\n",
+ "Gender 855 non-null object\n",
+ "Start Date 1000 non-null object\n",
+ "Last Login Time 1000 non-null object\n",
+ "Salary 1000 non-null int64\n",
+ "Bonus % 1000 non-null float64\n",
+ "Senior Management 933 non-null object\n",
+ "Team 957 non-null object\n",
+ "dtypes: float64(1), int64(1), object(6)\n",
+ "memory usage: 62.6+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()  # per-column dtype and non-null count of the raw frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "First Name 67\n",
+ "Gender 145\n",
+ "Start Date 0\n",
+ "Last Login Time 0\n",
+ "Salary 0\n",
+ "Bonus % 0\n",
+ "Senior Management 67\n",
+ "Team 43\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.isna().sum()  # number of missing values in each column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.dropna(subset=['First Name'])  # keep only rows that have a first name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Female 400\n",
+ "Male 395\n",
+ "Name: Gender, dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[\"Gender\"].value_counts(dropna=True)  # tally per label; missing entries are excluded"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Gender'] = df['Gender'].ffill()  # forward-fill gaps; replaces the deprecated fillna(method='ffill')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Team'] = df['Team'].ffill()  # forward-fill gaps; replaces the deprecated fillna(method='ffill')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 8/6/1993\n",
+ "1 3/31/1996\n",
+ "2 4/23/1993\n",
+ "3 3/4/2005\n",
+ "4 1/24/1998\n",
+ "Name: Start Date, dtype: object"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[\"Start Date\"].head(n=5)  # peek at the column before it is parsed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parse the date strings into datetimes in a single pass and store them\n",
+ "# back on the frame (no throwaway conversion beforehand).\n",
+ "df[\"Start Date\"] = pd.to_datetime(df[\"Start Date\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 12:42 PM\n",
+ "1 6:53 AM\n",
+ "2 11:17 AM\n",
+ "3 1:00 PM\n",
+ "4 4:47 PM\n",
+ "Name: Last Login Time, dtype: object"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[\"Last Login Time\"].head(n=5)  # peek at the column before it is parsed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Pin the 12-hour clock format: without it each time-of-day is stamped with the date the cell runs; with it the date part is a fixed 1900-01-01.\n",
+ "df[\"Last Login Time\"] = pd.to_datetime(df[\"Last Login Time\"], format=\"%I:%M %p\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 True\n",
+ "1 True\n",
+ "2 False\n",
+ "3 True\n",
+ "4 True\n",
+ "Name: Senior Management, dtype: object"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[\"Senior Management\"].head(n=5)  # peek at the column before it is cast"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A plain astype(\"bool\") maps missing entries to True, so resolve them\n",
+ "# explicitly first: an absent flag is treated as False.\n",
+ "df[\"Senior Management\"] = df[\"Senior Management\"].fillna(False).astype(\"bool\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Gender holds only a couple of distinct labels, so store it as a categorical;\n",
+ "# the conversion runs once and is assigned straight back.\n",
+ "df[\"Gender\"] = df[\"Gender\"].astype(\"category\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " First Name | \n",
+ " Gender | \n",
+ " Start Date | \n",
+ " Last Login Time | \n",
+ " Salary | \n",
+ " Bonus % | \n",
+ " Senior Management | \n",
+ " Team | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " Douglas | \n",
+ " Male | \n",
+ " 1993-08-06 | \n",
+ " 2018-12-23 12:42:00 | \n",
+ " 97308 | \n",
+ " 6.945 | \n",
+ " True | \n",
+ " Marketing | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " Thomas | \n",
+ " Male | \n",
+ " 1996-03-31 | \n",
+ " 2018-12-23 06:53:00 | \n",
+ " 61933 | \n",
+ " 4.170 | \n",
+ " True | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " Maria | \n",
+ " Female | \n",
+ " 1993-04-23 | \n",
+ " 2018-12-23 11:17:00 | \n",
+ " 130590 | \n",
+ " 11.858 | \n",
+ " False | \n",
+ " Finance | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " Jerry | \n",
+ " Male | \n",
+ " 2005-03-04 | \n",
+ " 2018-12-23 13:00:00 | \n",
+ " 138705 | \n",
+ " 9.340 | \n",
+ " True | \n",
+ " Finance | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " Larry | \n",
+ " Male | \n",
+ " 1998-01-24 | \n",
+ " 2018-12-23 16:47:00 | \n",
+ " 101004 | \n",
+ " 1.389 | \n",
+ " True | \n",
+ " Client Services | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " First Name Gender Start Date Last Login Time Salary Bonus % \\\n",
+ "0 Douglas Male 1993-08-06 2018-12-23 12:42:00 97308 6.945 \n",
+ "1 Thomas Male 1996-03-31 2018-12-23 06:53:00 61933 4.170 \n",
+ "2 Maria Female 1993-04-23 2018-12-23 11:17:00 130590 11.858 \n",
+ "3 Jerry Male 2005-03-04 2018-12-23 13:00:00 138705 9.340 \n",
+ "4 Larry Male 1998-01-24 2018-12-23 16:47:00 101004 1.389 \n",
+ "\n",
+ " Senior Management Team \n",
+ "0 True Marketing \n",
+ "1 True NaN \n",
+ "2 False Finance \n",
+ "3 True Finance \n",
+ "4 True Client Services "
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "df = pd.read_csv(\"employees.csv\", parse_dates=[\"Start Date\", \"Last Login Time\"])\n",
+ "# astype(\"bool\") alone would turn every missing flag into True; treat an absent flag as False explicitly.\n",
+ "df[\"Senior Management\"] = df[\"Senior Management\"].fillna(False).astype(\"bool\")\n",
+ "df[\"Gender\"] = df[\"Gender\"].astype(\"category\")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 1000 entries, 0 to 999\n",
+ "Data columns (total 8 columns):\n",
+ "First Name 933 non-null object\n",
+ "Gender 855 non-null category\n",
+ "Start Date 1000 non-null datetime64[ns]\n",
+ "Last Login Time 1000 non-null datetime64[ns]\n",
+ "Salary 1000 non-null int64\n",
+ "Bonus % 1000 non-null float64\n",
+ "Senior Management 1000 non-null bool\n",
+ "Team 957 non-null object\n",
+ "dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)\n",
+ "memory usage: 49.0+ KB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()  # dtypes and memory footprint after the conversions"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/ML52/ML52.ipynb b/ML52/ML52.ipynb
index 6e9cecb..28f89f0 100644
--- a/ML52/ML52.ipynb
+++ b/ML52/ML52.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -38,7 +38,7 @@
" \n",
" \n",
" \n",
- " | 0 | \n",
+ " 0 | \n",
" Douglas | \n",
" Male | \n",
" 8/6/1993 | \n",
@@ -49,7 +49,7 @@
" Marketing | \n",
"
\n",
" \n",
- " | 1 | \n",
+ " 1 | \n",
" Thomas | \n",
" Male | \n",
" 3/31/1996 | \n",
@@ -60,7 +60,7 @@
" NaN | \n",
"
\n",
" \n",
- " | 2 | \n",
+ " 2 | \n",
" Maria | \n",
" Female | \n",
" 4/23/1993 | \n",
@@ -71,7 +71,7 @@
" Finance | \n",
"
\n",
" \n",
- " | 3 | \n",
+ " 3 | \n",
" Jerry | \n",
" Male | \n",
" 3/4/2005 | \n",
@@ -82,7 +82,7 @@
" Finance | \n",
"
\n",
" \n",
- " | 4 | \n",
+ " 4 | \n",
" Larry | \n",
" Male | \n",
" 1/24/1998 | \n",
@@ -112,7 +112,7 @@
"4 True Client Services "
]
},
- "execution_count": 4,
+ "execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
@@ -127,7 +127,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 2,
"metadata": {},
"outputs": [
{
@@ -138,15 +138,15 @@
"RangeIndex: 1000 entries, 0 to 999\n",
"Data columns (total 8 columns):\n",
"First Name 933 non-null object\n",
- "Gender 855 non-null category\n",
- "Start Date 1000 non-null datetime64[ns]\n",
- "Last Login Time 1000 non-null datetime64[ns]\n",
+ "Gender 855 non-null object\n",
+ "Start Date 1000 non-null object\n",
+ "Last Login Time 1000 non-null object\n",
"Salary 1000 non-null int64\n",
"Bonus % 1000 non-null float64\n",
- "Senior Management 1000 non-null bool\n",
+ "Senior Management 933 non-null object\n",
"Team 957 non-null object\n",
- "dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)\n",
- "memory usage: 49.0+ KB\n"
+ "dtypes: float64(1), int64(1), object(6)\n",
+ "memory usage: 62.6+ KB\n"
]
}
],
@@ -156,30 +156,80 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "0.7877813504823151"
+ "First Name 67\n",
+ "Gender 145\n",
+ "Start Date 0\n",
+ "Last Login Time 0\n",
+ "Salary 0\n",
+ "Bonus % 0\n",
+ "Senior Management 67\n",
+ "Team 43\n",
+ "dtype: int64"
]
},
- "execution_count": 21,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "49.0/62.2"
+ "df.isna().sum()  # number of missing values in each column"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "df = df.dropna(subset=['First Name'])  # keep only rows that have a first name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Female 400\n",
+ "Male 395\n",
+ "Name: Gender, dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[\"Gender\"].value_counts(dropna=True)  # tally per label; missing entries are excluded"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Gender'] = df['Gender'].ffill()  # forward-fill gaps; replaces the deprecated fillna(method='ffill')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df['Team'] = df['Team'].ffill()  # forward-fill gaps; replaces the deprecated fillna(method='ffill')"
+ ]
},
{
"cell_type": "code",
@@ -472,7 +522,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.1"
+ "version": "3.7.4"
}
},
"nbformat": 4,