From 224cd39e0df18f989e536e470ef06de7ccbc20b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ant=C3=B3nio=20Gomes?= Date: Sun, 19 Oct 2025 19:39:18 +0200 Subject: [PATCH] Done With The Lab Data Filtering --- lab-dw-aggregating.ipynb | 202 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 201 insertions(+), 1 deletion(-) diff --git a/lab-dw-aggregating.ipynb b/lab-dw-aggregating.ipynb index fadd718..87c7583 100644 --- a/lab-dw-aggregating.ipynb +++ b/lab-dw-aggregating.ipynb @@ -134,7 +134,207 @@ }, "outputs": [], "source": [ - "# your code goes here" + "# your code goes here\n", + "import pandas as pd\n", + "\n", + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis.csv\")\n", + "df.head()\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db998b3d", + "metadata": {}, + "outputs": [], + "source": [ + "df.columns = df.columns.str.lower()\n", + "\n", + "df.columns = df.columns.str.replace(\" \", \"_\")\n", + "\n", + "df = df.drop(\"unnamed:_0\", axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e36e425b", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop_duplicates(subset = [\"customer\"], inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "579503e4", + "metadata": {}, + "outputs": [], + "source": [ + "df.duplicated().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "81f91121", + "metadata": {}, + "outputs": [], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b492f159", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"state\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b93581cc", + "metadata": {}, + "outputs": [], + "source": [ + "df['state'] = df['state'].fillna(df['state'].mode()[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"23dda840", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"state\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4100f7e", + "metadata": {}, + "outputs": [], + "source": [ + "df['response'] = df['response'].fillna(df['response'].mode()[0])\n", + "df['vehicle_class'] = df['vehicle_class'].fillna(df['vehicle_class'].mode()[0])\n", + "df['vehicle_size'] = df['vehicle_size'].fillna(df['vehicle_size'].mode()[0])\n", + "\n", + "df['months_since_last_claim'] = df['months_since_last_claim'].fillna(df['months_since_last_claim'].median())\n", + "\n", + "df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(0)\n", + "\n", + "df = df.dropna(subset = [\"vehicle_type\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "051109b5", + "metadata": {}, + "outputs": [], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0ffda1c", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"number_of_open_complaints\"] = df[\"number_of_open_complaints\"].astype(int)\n", + "df[\"months_since_last_claim\"] = df[\"months_since_last_claim\"].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "65633d50", + "metadata": {}, + "outputs": [], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c436343", + "metadata": {}, + "outputs": [], + "source": [ + "dfLow = df[(df['total_claim_amount'] < 1000) & (df['response'] == 'Yes')]\n", + "dfLow.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da5b059", + "metadata": {}, + "outputs": [], + "source": [ + "dfYes = df[(df[\"response\"] == \"Yes\")]\n", + "\n", + "df_1 = dfYes.pivot_table(index=[\"policy_type\", \"gender\"], values=[\"monthly_premium_auto\", \"customer_lifetime_value\", \"total_claim_amount\"], aggfunc= \"mean\")\n", + "\n", + "df_1" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "d0ba29b4", + "metadata": {}, + "outputs": [], + "source": [ + "# Males with Special Auto are the most profitable and low-risk segment, showing high lifetime value and low claims.\n", + "# Gender differences are small except in Special Auto." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a92df366", + "metadata": {}, + "outputs": [], + "source": [ + "dfStCounts = df[\"state\"].value_counts()\n", + "# Keep only the states with more than 500 customers (filter the counts instead of returning a True/False mask).\n", + "dfBigSt = dfStCounts[dfStCounts > 500]\n", + "\n", + "dfBigSt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40e0a36", + "metadata": {}, + "outputs": [], + "source": [ + "df_2 = df.pivot_table(index = [\"education\", \"gender\"], values = \"customer_lifetime_value\", aggfunc = [\"max\", \"min\", \"median\"])\n", + "df_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca37e533", + "metadata": {}, + "outputs": [], + "source": [ + "# CLV varies widely within each education-gender group, but medians are similar, which shows that gender has little impact. High School\n", + "# or Below and Master's customers have the highest median CLVs, while Doctorates are lower. Education influences CLV more than gender." ] } ],