bcgov · emi-hi · Mar 31, 2025 · Mar 31, 2025
diff --git a/openshift/scripts/compare_csv.ipynb b/openshift/scripts/compare_csv.ipynb
@@ -0,0 +1,201 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SQL Statement\n",
+    "This bit of code can be run in psql to export the results of a query to a csv file in your postgres container.\n",
+    "After running it, copy the file from your docker container into a local folder:\n",
+    "`docker cp container_id:/output.csv ~/Documents/output.csv`\n",
+    "\n",
+    "Run either SQL statement, but make sure compare_cols in the function matches the one you ran. The first one exports a model_year column that shows the description from the model_year table, i.e. 2020, 2021. The second one exports model_year_id, i.e. 1, 2, 3.\n",
+    "\n",
+    "### -- if starting new use this query; compare_cols in the compare_csvs function below must then reference model_year rather than model_year_id\n",
+    "\\copy ( SELECT  icbc_registration_data.id AS registration_id, icbc_vehicle.id AS vehicle_id, model_name, make, model_year.description as model_year, icbc_registration_data.create_timestamp, icbc_registration_data.update_timestamp, vin::TEXT AS vin, icbc_registration_data.icbc_upload_date_id  FROM icbc_vehicle JOIN icbc_registration_data  ON icbc_vehicle.id = icbc_registration_data.icbc_vehicle_id JOIN model_year on model_year.id = model_year_id) TO 'output.csv' WITH CSV HEADER;\n",
+    "\n",
+    "\n",
+    "### -- old query that only exports model_year_id; compare_cols must then reference model_year_id\n",
+    "\\copy ( SELECT  icbc_registration_data.id AS registration_id, icbc_vehicle.id AS vehicle_id, model_name, make, model_year_id, icbc_registration_data.create_timestamp, icbc_registration_data.update_timestamp, vin::TEXT AS vin, icbc_registration_data.icbc_upload_date_id  FROM icbc_vehicle JOIN icbc_registration_data  ON icbc_vehicle.id = icbc_registration_data.icbc_vehicle_id ) TO 'output.csv' WITH CSV HEADER;\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "import pandas as pd\n",
+    "import openpyxl\n",
+    "from openpyxl.styles import PatternFill\n",
+    "\n",
+    "def compare_csvs(file1, file2, output_file, compare_cols=None):\n",
+    "    \"\"\"Compare two CSV exports by VIN and write the differing rows to an Excel file.\n",
+    "\n",
+    "    The files are outer-joined on 'vin'. A row is reported when its VIN exists in\n",
+    "    only one file or when any compared column differs; a value missing in both\n",
+    "    files counts as equal. VIN cells of one-sided rows are filled green, and\n",
+    "    mismatched cells of rows present in both files are filled yellow.\n",
+    "\n",
+    "    Args:\n",
+    "        file1: Path of the first CSV file.\n",
+    "        file2: Path of the second CSV file.\n",
+    "        output_file: Path of the .xlsx workbook to write.\n",
+    "        compare_cols: Names of the columns to compare; they must exist in both\n",
+    "            files. Defaults to ['vehicle_id', 'model_name', 'make', 'model_year'].\n",
+    "            Use 'model_year_id' instead of 'model_year' if the export has the id.\n",
+    "\n",
+    "    Returns:\n",
+    "        None. The workbook is written to output_file.\n",
+    "\n",
+    "    Raises:\n",
+    "        KeyError: If a compared column is missing from either file.\n",
+    "    \"\"\"\n",
+    "    import os  # imported here so the function carries its own dependency\n",
+    "\n",
+    "    if compare_cols is None:\n",
+    "        compare_cols = ['vehicle_id', 'model_name', 'make', 'model_year']\n",
+    "\n",
+    "    # Read the CSVs into pandas DataFrames, keeping VINs as text\n",
+    "    file_1_df = pd.read_csv(file1, dtype={'vin': str})\n",
+    "    file_2_df = pd.read_csv(file2, dtype={'vin': str})\n",
+    "\n",
+    "    # Remove accidental quotes if they exist\n",
+    "    file_1_df[\"vin\"] = file_1_df[\"vin\"].str.strip('\"')\n",
+    "    file_2_df[\"vin\"] = file_2_df[\"vin\"].str.strip('\"')\n",
+    "\n",
+    "    # Outer-join on 'vin'; the indicator column records which file(s) each row came from\n",
+    "    merged_df = pd.merge(\n",
+    "        file_1_df, file_2_df, on=\"vin\", how=\"outer\",\n",
+    "        suffixes=('_file_1', '_file_2'), indicator=True,\n",
+    "    )\n",
+    "\n",
+    "    # Find VINs that exist in only one file\n",
+    "    only_in_file_1 = merged_df['_merge'] == 'left_only'\n",
+    "    only_in_file_2 = merged_df['_merge'] == 'right_only'\n",
+    "    vin_is_unique = only_in_file_1 | only_in_file_2\n",
+    "    merged_df = merged_df.drop(columns='_merge')  # keep the helper column out of the export\n",
+    "\n",
+    "    # Fail early with a readable message instead of a KeyError deep inside the comparison\n",
+    "    missing_cols = [\n",
+    "        col + suffix\n",
+    "        for col in compare_cols\n",
+    "        for suffix in ('_file_1', '_file_2')\n",
+    "        if col + suffix not in merged_df.columns\n",
+    "    ]\n",
+    "    if missing_cols:\n",
+    "        raise KeyError(f\"compare_cols not present in both files: {missing_cols}\")\n",
+    "\n",
+    "    # Print the total number of rows before filtering\n",
+    "    print(f\"Total rows before filtering: {len(merged_df)}\")\n",
+    "\n",
+    "    # Print only unmatched VINs\n",
+    "    print(\"VINs only in File 1 (not in File 2):\")\n",
+    "    print(merged_df.loc[only_in_file_1, 'vin'].tolist())\n",
+    "\n",
+    "    print(\"\\nVINs only in File 2 (not in File 1):\")\n",
+    "    print(merged_df.loc[only_in_file_2, 'vin'].tolist())\n",
+    "\n",
+    "    # One boolean column per compared field: True where the two files disagree.\n",
+    "    # NaN != NaN in Python, so a value missing in both files is excluded explicitly.\n",
+    "    diff_mask = pd.DataFrame(index=merged_df.index)\n",
+    "    for col in compare_cols:\n",
+    "        left = merged_df[col + '_file_1']\n",
+    "        right = merged_df[col + '_file_2']\n",
+    "        diff_mask[col] = left.ne(right) & ~(left.isna() & right.isna())\n",
+    "\n",
+    "    # Keep rows with at least one difference, plus VINs found in only one file\n",
+    "    filtered_df = merged_df[diff_mask.any(axis=1) | vin_is_unique]\n",
+    "    print(f\"Total rows after filtering: {len(filtered_df)}\")\n",
+    "\n",
+    "    # Write to an Excel file, then reopen it with openpyxl for formatting\n",
+    "    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:\n",
+    "        filtered_df.to_excel(writer, index=False, sheet_name='Comparison')\n",
+    "\n",
+    "    wb = openpyxl.load_workbook(output_file)\n",
+    "    sheet = wb['Comparison']\n",
+    "    sheet.insert_rows(1, amount=1)  # Insert row at the top for the file names\n",
+    "\n",
+    "    # merge() puts file 1's columns (including 'vin') first and file 2's after them\n",
+    "    columns = list(merged_df.columns)\n",
+    "    file_2_start = len(file_1_df.columns) + 1\n",
+    "    headers = [(1, file_2_start - 1, file1), (file_2_start, len(columns), file2)]\n",
+    "\n",
+    "    # Write each file name in a bold cell merged across that file's columns\n",
+    "    bold_font = openpyxl.styles.Font(size=16, bold=True)\n",
+    "    for start_col, end_col, path in headers:\n",
+    "        if start_col > end_col:\n",
+    "            continue  # this file contributed no columns of its own\n",
+    "        if end_col > start_col:\n",
+    "            sheet.merge_cells(start_row=1, start_column=start_col, end_row=1, end_column=end_col)\n",
+    "        header_cell = sheet.cell(row=1, column=start_col)\n",
+    "        header_cell.value = f\"File: {os.path.basename(path)}\"\n",
+    "        header_cell.font = bold_font\n",
+    "\n",
+    "    # Add color formatting to sheet\n",
+    "    green = PatternFill(start_color=\"00FF00\", end_color=\"00FF00\", fill_type=\"solid\")\n",
+    "    yellow = PatternFill(start_color=\"FFFF00\", end_color=\"FFFF00\", fill_type=\"solid\")\n",
+    "\n",
+    "    vin_col_idx = columns.index('vin') + 1\n",
+    "    col_positions = {\n",
+    "        col: (columns.index(col + '_file_1') + 1, columns.index(col + '_file_2') + 1)\n",
+    "        for col in compare_cols\n",
+    "    }\n",
+    "\n",
+    "    # Sheet row 1 holds the file names and row 2 the column names, so data starts at row 3\n",
+    "    for excel_row, row_idx in enumerate(filtered_df.index, start=3):\n",
+    "        if vin_is_unique.at[row_idx]:\n",
+    "            # VIN exists in one file only: mark the VIN and skip per-column highlighting\n",
+    "            sheet.cell(row=excel_row, column=vin_col_idx).fill = green\n",
+    "            continue\n",
+    "        for col in compare_cols:\n",
+    "            if diff_mask.at[row_idx, col]:\n",
+    "                for col_idx in col_positions[col]:\n",
+    "                    sheet.cell(row=excel_row, column=col_idx).fill = yellow\n",
+    "    wb.save(output_file)\n",
+    "    print('Finished!')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Compare the preprocessed export against the additions-only export\n",
+    "preprocessed_dataset = 'output-preprocessed.csv'\n",
+    "only_additions = 'output_only_additions.csv'\n",
+    "compare_csvs(file1=preprocessed_dataset, file2=only_additions, output_file=\"only_additions.xlsx\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/openshift/scripts/import-data-from-local.sh b/openshift/scripts/import-data-from-local.sh
@@ -7,8 +7,7 @@ set -e
 
 # 1 Argument  = 'local container name or id'
 # example command
-# . ./import-data.sh 398cd4661173
-
+# . ./import-data-from-local.sh 398cd4661173
 if [ "$#" -ne 1 ]; then
     echo "Passed $# parameters. Expected 1."
     exit 1