diff --git a/ML52/.ipynb_checkpoints/ML52-checkpoint.ipynb b/ML52/.ipynb_checkpoints/ML52-checkpoint.ipynb new file mode 100644 index 0000000..28f89f0 --- /dev/null +++ b/ML52/.ipynb_checkpoints/ML52-checkpoint.ipynb @@ -0,0 +1,530 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
First NameGenderStart DateLast Login TimeSalaryBonus %Senior ManagementTeam
0DouglasMale8/6/199312:42 PM973086.945TrueMarketing
1ThomasMale3/31/19966:53 AM619334.170TrueNaN
2MariaFemale4/23/199311:17 AM13059011.858FalseFinance
3JerryMale3/4/20051:00 PM1387059.340TrueFinance
4LarryMale1/24/19984:47 PM1010041.389TrueClient Services
\n", + "
" + ], + "text/plain": [ + " First Name Gender Start Date Last Login Time Salary Bonus % \\\n", + "0 Douglas Male 8/6/1993 12:42 PM 97308 6.945 \n", + "1 Thomas Male 3/31/1996 6:53 AM 61933 4.170 \n", + "2 Maria Female 4/23/1993 11:17 AM 130590 11.858 \n", + "3 Jerry Male 3/4/2005 1:00 PM 138705 9.340 \n", + "4 Larry Male 1/24/1998 4:47 PM 101004 1.389 \n", + "\n", + " Senior Management Team \n", + "0 True Marketing \n", + "1 True NaN \n", + "2 False Finance \n", + "3 True Finance \n", + "4 True Client Services " + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"employees.csv\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1000 entries, 0 to 999\n", + "Data columns (total 8 columns):\n", + "First Name 933 non-null object\n", + "Gender 855 non-null object\n", + "Start Date 1000 non-null object\n", + "Last Login Time 1000 non-null object\n", + "Salary 1000 non-null int64\n", + "Bonus % 1000 non-null float64\n", + "Senior Management 933 non-null object\n", + "Team 957 non-null object\n", + "dtypes: float64(1), int64(1), object(6)\n", + "memory usage: 62.6+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "First Name 67\n", + "Gender 145\n", + "Start Date 0\n", + "Last Login Time 0\n", + "Salary 0\n", + "Bonus % 0\n", + "Senior Management 67\n", + "Team 43\n", + "dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df['First Name'].notna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Female 400\n", + "Male 395\n", + "Name: Gender, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Gender'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df['Gender']=df['Gender'].fillna(method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df['Team']=df['Team'].fillna(method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 8/6/1993\n", + "1 3/31/1996\n", + "2 4/23/1993\n", + "3 3/4/2005\n", + "4 1/24/1998\n", + "Name: Start Date, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Start Date\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "pd.to_datetime(df[\"Start Date\"])\n", + "\n", + "df[\"Start Date\"] = pd.to_datetime(df[\"Start Date\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 12:42 PM\n", + "1 6:53 AM\n", + "2 11:17 AM\n", + "3 1:00 PM\n", + "4 4:47 PM\n", + "Name: Last Login Time, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Last Login Time\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "pd.to_datetime(df[\"Last Login Time\"])\n", + "df[\"Last Login Time\"]= pd.to_datetime(df[\"Last Login Time\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 True\n", + "2 False\n", + "3 True\n", + "4 True\n", + "Name: Senior Management, dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Senior Management\"].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Senior Management\"].astype(\"bool\")\n", + "\n", + "df[\"Senior Management\"] = df[\"Senior Management\"].astype(\"bool\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Gender\"].astype(\"category\")\n", + "\n", + "df[\"Gender\"] = df[\"Gender\"].astype(\"category\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
First NameGenderStart DateLast Login TimeSalaryBonus %Senior ManagementTeam
0DouglasMale1993-08-062018-12-23 12:42:00973086.945TrueMarketing
1ThomasMale1996-03-312018-12-23 06:53:00619334.170TrueNaN
2MariaFemale1993-04-232018-12-23 11:17:0013059011.858FalseFinance
3JerryMale2005-03-042018-12-23 13:00:001387059.340TrueFinance
4LarryMale1998-01-242018-12-23 16:47:001010041.389TrueClient Services
\n", + "
" + ], + "text/plain": [ + " First Name Gender Start Date Last Login Time Salary Bonus % \\\n", + "0 Douglas Male 1993-08-06 2018-12-23 12:42:00 97308 6.945 \n", + "1 Thomas Male 1996-03-31 2018-12-23 06:53:00 61933 4.170 \n", + "2 Maria Female 1993-04-23 2018-12-23 11:17:00 130590 11.858 \n", + "3 Jerry Male 2005-03-04 2018-12-23 13:00:00 138705 9.340 \n", + "4 Larry Male 1998-01-24 2018-12-23 16:47:00 101004 1.389 \n", + "\n", + " Senior Management Team \n", + "0 True Marketing \n", + "1 True NaN \n", + "2 False Finance \n", + "3 True Finance \n", + "4 True Client Services " + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"employees.csv\", parse_dates=[\"Start Date\",\"Last Login Time\"])\n", + "df[\"Senior Management\"] = df[\"Senior Management\"].astype(\"bool\")\n", + "df[\"Gender\"] = df[\"Gender\"].astype(\"category\")\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1000 entries, 0 to 999\n", + "Data columns (total 8 columns):\n", + "First Name 933 non-null object\n", + "Gender 855 non-null category\n", + "Start Date 1000 non-null datetime64[ns]\n", + "Last Login Time 1000 non-null datetime64[ns]\n", + "Salary 1000 non-null int64\n", + "Bonus % 1000 non-null float64\n", + "Senior Management 1000 non-null bool\n", + "Team 957 non-null object\n", + "dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)\n", + "memory usage: 49.0+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/ML52/ML52.ipynb b/ML52/ML52.ipynb index 6e9cecb..28f89f0 100644 --- a/ML52/ML52.ipynb +++ b/ML52/ML52.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -38,7 +38,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " Douglas\n", " Male\n", " 8/6/1993\n", @@ -49,7 +49,7 @@ " Marketing\n", " \n", " \n", - " 1\n", + " 1\n", " Thomas\n", " Male\n", " 3/31/1996\n", @@ -60,7 +60,7 @@ " NaN\n", " \n", " \n", - " 2\n", + " 2\n", " Maria\n", " Female\n", " 4/23/1993\n", @@ -71,7 +71,7 @@ " Finance\n", " \n", " \n", - " 3\n", + " 3\n", " Jerry\n", " Male\n", " 3/4/2005\n", @@ -82,7 +82,7 @@ " Finance\n", " \n", " \n", - " 4\n", + " 4\n", " Larry\n", " Male\n", " 1/24/1998\n", @@ -112,7 +112,7 @@ "4 True Client Services " ] }, - "execution_count": 4, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -127,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -138,15 +138,15 @@ "RangeIndex: 1000 entries, 0 to 999\n", "Data columns (total 8 columns):\n", "First Name 933 non-null object\n", - "Gender 855 non-null category\n", - "Start Date 1000 non-null datetime64[ns]\n", - "Last Login Time 1000 non-null datetime64[ns]\n", + "Gender 855 non-null object\n", + "Start Date 1000 non-null object\n", + "Last Login Time 1000 non-null object\n", "Salary 1000 non-null int64\n", "Bonus % 1000 non-null float64\n", - "Senior Management 1000 non-null bool\n", + "Senior Management 933 non-null object\n", "Team 957 non-null object\n", - "dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)\n", - "memory usage: 49.0+ KB\n" + "dtypes: float64(1), int64(1), object(6)\n", + "memory usage: 62.6+ KB\n" ] } ], @@ -156,30 +156,80 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.7877813504823151" + "First Name 67\n", + "Gender 145\n", + "Start Date 0\n", + "Last Login Time 0\n", + "Salary 0\n", + "Bonus % 0\n", + "Senior Management 67\n", + "Team 43\n", + "dtype: int64" ] }, - "execution_count": 21, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "49.0/62.2" + "df.isnull().sum()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df = df[df['First Name'].notna()]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Female 400\n", + "Male 395\n", + "Name: Gender, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Gender'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df['Gender']=df['Gender'].fillna(method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "df['Team']=df['Team'].fillna(method='ffill')" + ] }, { "cell_type": "code", @@ -472,7 +522,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.7.4" } }, "nbformat": 4,