diff --git a/openshift/scripts/compare_csv.ipynb b/openshift/scripts/compare_csv.ipynb new file mode 100644 index 000000000..e8a65dc7a --- /dev/null +++ b/openshift/scripts/compare_csv.ipynb @@ -0,0 +1,201 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SQL Statement\n", + "This bit of code can be run in psql to export the results of a query to a CSV file in your postgres container.\n", + "After running it, copy the output file from your docker container into a local folder:\n", + "docker cp container_id:/output.csv ~/Documents/output.csv\n", + "\n", + "Run either SQL statement, but be sure to update compare_cols in the function to reflect the change. The first one will export data with a model_year column that shows the description from the model_year table, i.e. 2020, 2021. The second one shows an id, i.e. 1, 2, 3.\n", + "\n", + "### -- if starting new use this line and update compare_csvs function below where it references model_year_id\n", + "\copy ( SELECT icbc_registration_data.id AS registration_id, icbc_vehicle.id AS vehicle_id, model_name, make, model_year.description as model_year, icbc_registration_data.create_timestamp, icbc_registration_data.update_timestamp, vin::TEXT AS vin, icbc_registration_data.icbc_upload_date_id FROM icbc_vehicle JOIN icbc_registration_data ON icbc_vehicle.id = icbc_registration_data.icbc_vehicle_id JOIN model_year on model_year.id = model_year_id) TO 'output.csv' WITH CSV HEADER;\n", + "\n", + "\n", + "### -- old code where it just showed the model_year_id\n", + "\copy ( SELECT icbc_registration_data.id AS registration_id, icbc_vehicle.id AS vehicle_id, model_name, make, model_year_id, icbc_registration_data.create_timestamp, icbc_registration_data.update_timestamp, vin::TEXT AS vin, icbc_registration_data.icbc_upload_date_id FROM icbc_vehicle JOIN icbc_registration_data ON icbc_vehicle.id = icbc_registration_data.icbc_vehicle_id ) TO 'output.csv' WITH CSV HEADER;\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "import openpyxl\n", + "from openpyxl.styles import PatternFill\n", + "\n", + "def compare_csvs(file1, file2, output_file):\n", + " # Read the CSVs into pandas DataFrames\n", + " file_1_df = pd.read_csv(file1, dtype={'vin': str})\n", + " file_2_df = pd.read_csv(file2, dtype={'vin': str})\n", + "\n", + " # Remove accidental quotes if they exist\n", + " file_1_df[\"vin\"] = file_1_df[\"vin\"].str.strip('\"') \n", + " file_2_df[\"vin\"] = file_2_df[\"vin\"].str.strip('\"') \n", + "\n", + " \n", + " #if using the sql query that uses model_year instead of model_year_id use this\n", + " compare_cols = ['vehicle_id', 'model_name', 'make', 'model_year']\n", + " \n", + " # otherwise use this\n", + " # compare_cols = ['vehicle_id', 'model_name', 'make', 'model_year_id']\n", + "\n", + " # Merge the two DataFrames on the 'vin' column (the unique identifier) with an outer join\n", + " merged_df = pd.merge(file_1_df, file_2_df, on=\"vin\", how=\"outer\", suffixes=('_file_1', '_file_2'))\n", + "\n", + " # Print the total number of rows before filtering\n", + " print(f\"Total rows before filtering: {len(merged_df)}\")\n", + "\n", + " # Find VINs that exist in only one file\n", + " vin_only_in_file_1 = merged_df[pd.isna(merged_df['registration_id_file_2'])]['vin']\n", + " vin_only_in_file_2 = merged_df[pd.isna(merged_df['registration_id_file_1'])]['vin']\n", + " \n", + " # Print only unmatched VINs\n", + " print(\"VINs only in File 1 (not in File 2):\")\n", + " print(vin_only_in_file_1.tolist())\n", + " \n", + " print(\"\\nVINs only in File 2 (not in File 1):\")\n", + " print(vin_only_in_file_2.tolist())\n", + " \n", + " # Create a new DataFrame to keep track of differences\n", + " for col in compare_cols:\n", + " file_1_col = col + \"_file_1\"\n", + " file_2_col = col + \"_file_2\"\n", + " diff_col = col + \"_diff\"\n", + " \n", + " # Compare columns and mark 
differences\n", + " merged_df[diff_col] = merged_df.apply(\n", + " lambda row: 'yellow' if row[file_1_col] != row[file_2_col] else 'green', axis=1\n", + " ).reset_index(drop=True) \n", + "\n", + " # Filter out rows where there are no differences\n", + " filtered_df = merged_df[\n", + " (merged_df.filter(like='_diff') == 'yellow').any(axis=1)\n", + " ]\n", + " print(f\"Total rows after filtering: {len(filtered_df)}\")\n", + " # Remove the '_diff' columns before saving the file\n", + " filtered_df = filtered_df.drop(columns=[col + \"_diff\" for col in compare_cols])\n", + "\n", + " \"\"\" \n", + " Create excel and add headers with filenames\n", + " \"\"\"\n", + " # write to an Excel file\n", + " with pd.ExcelWriter(output_file, engine='openpyxl') as writer:\n", + " filtered_df.to_excel(writer, index=False, sheet_name='Comparison')\n", + "\n", + " wb = openpyxl.load_workbook(output_file)\n", + " sheet = wb['Comparison']\n", + "\n", + " file_1_name = file1.split('/')[-1]\n", + " file_2_name = file2.split('/')[-1]\n", + "\n", + " sheet.insert_rows(1, amount=1) # Insert row at the top\n", + "\n", + " # Merge cells for file names\n", + " sheet.merge_cells(start_row=1, start_column=1, end_row=1, end_column=7) # File 1 header\n", + " sheet.merge_cells(start_row=1, start_column=9, end_row=1, end_column=sheet.max_column) # File 2 header\n", + "\n", + " # Write file names in merged cells\n", + " sheet.cell(row=1, column=1).value = f\"File: {file_1_name}\"\n", + " sheet.cell(row=1, column=9).value = f\"File: {file_2_name}\"\n", + "\n", + " # Make headers bold\n", + " bold_font = openpyxl.styles.Font(size=16, bold=True)\n", + " sheet.cell(row=1, column=1).font = bold_font\n", + " sheet.cell(row=1, column=9).font = bold_font\n", + "\n", + " \"\"\"\n", + " Add color formatting to sheet\n", + " \"\"\"\n", + " # Get original row indices from merged_df before filtering\n", + " filtered_indices = merged_df[\n", + " (merged_df.filter(like='_diff') == 'yellow').any(axis=1)\n", + " 
].index.tolist()\n", + "\n", + " # Loop through filtered rows for applying highlighting\n", + " for filtered_idx, row_idx in enumerate(filtered_indices):\n", + " excel_row = filtered_idx + 3 # Excel rows start at 1, +3 for header rows\n", + " \n", + " # Highlight VIN column if it's missing in one file\n", + " vin_col_idx = merged_df.columns.get_loc('vin') + 1\n", + " vin_cell = sheet.cell(row=excel_row, column=vin_col_idx)\n", + " \n", + " # Check if VIN is unique to one of the files\n", + " vin_is_unique = False\n", + " if pd.isna(merged_df.loc[row_idx, 'registration_id_file_1']): # Unique to file_2\n", + " vin_cell.fill = PatternFill(start_color=\"00FF00\", end_color=\"00FF00\", fill_type=\"solid\") # Green\n", + " vin_is_unique = True\n", + " elif pd.isna(merged_df.loc[row_idx, 'registration_id_file_2']): # Unique to file_1\n", + " vin_cell.fill = PatternFill(start_color=\"00FF00\", end_color=\"00FF00\", fill_type=\"solid\") # Green\n", + " vin_is_unique = True\n", + " \n", + " # Apply yellow highlighting for mismatched columns, **skip if VIN is unique**\n", + " if not vin_is_unique:\n", + " for col in compare_cols:\n", + " file_1_col = col + \"_file_1\"\n", + " file_2_col = col + \"_file_2\"\n", + " \n", + " # Ensure columns exist\n", + " if file_1_col not in merged_df.columns or file_2_col not in merged_df.columns:\n", + " continue\n", + " \n", + " col_idx_1 = merged_df.columns.get_loc(file_1_col) + 1\n", + " col_idx_2 = merged_df.columns.get_loc(file_2_col) + 1\n", + " \n", + " # Check if the values in these columns differ\n", + " if merged_df.loc[row_idx, file_1_col] != merged_df.loc[row_idx, file_2_col]:\n", + " # Apply yellow highlighting for differences\n", + " sheet.cell(row=excel_row, column=col_idx_1).fill = PatternFill(start_color=\"FFFF00\", end_color=\"FFFF00\", fill_type=\"solid\") # Yellow\n", + " sheet.cell(row=excel_row, column=col_idx_2).fill = PatternFill(start_color=\"FFFF00\", end_color=\"FFFF00\", fill_type=\"solid\") # Yellow\n", + " 
wb.save(output_file)\n", + " print('Finished!')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessed_dataset = 'output-preprocessed.csv'\n", + "only_additions = 'output_only_additions.csv'\n", + "compare_csvs(preprocessed_dataset, only_additions, \"only_additions.xlsx\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/openshift/scripts/import-data-from-local.sh b/openshift/scripts/import-data-from-local.sh index b47895389..4157ba9f4 100644 --- a/openshift/scripts/import-data-from-local.sh +++ b/openshift/scripts/import-data-from-local.sh @@ -7,8 +7,7 @@ set -e # 1 Argument = 'local container name or id' # example command -# . ./import-data.sh 398cd4661173 - +# . ./import-data-from-local.sh 398cd4661173 if [ "$#" -ne 1 ]; then echo "Passed $# parameters. Expected 1." exit 1