Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 201 additions & 0 deletions openshift/scripts/compare_csv.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## SQL Statement\n",
"This snippet can be run in psql to export the results of a query to a CSV file in your Postgres container.\n",
"After running it, copy the file from your Docker container into a local folder:\n",
"`docker cp container_id:/output.csv ~/Documents/output.csv`\n",
"\n",
"Run either SQL statement, but make sure `compare_cols` in the function below matches the one you ran. The first exports a `model_year` column holding the description from the model_year table (e.g. 2020, 2021). The second exports `model_year_id`, the numeric id (e.g. 1, 2, 3).\n",
"\n",
"### -- If starting fresh, use this statement and make sure the compare_csvs function below compares model_year rather than model_year_id\n",
"\\copy ( SELECT icbc_registration_data.id AS registration_id, icbc_vehicle.id AS vehicle_id, model_name, make, model_year.description as model_year, icbc_registration_data.create_timestamp, icbc_registration_data.update_timestamp, vin::TEXT AS vin, icbc_registration_data.icbc_upload_date_id FROM icbc_vehicle JOIN icbc_registration_data ON icbc_vehicle.id = icbc_registration_data.icbc_vehicle_id JOIN model_year on model_year.id = model_year_id) TO 'output.csv' WITH CSV HEADER;\n",
"\n",
"\n",
"### -- old code where it just showed the model_year_id\n",
"\\copy ( SELECT icbc_registration_data.id AS registration_id, icbc_vehicle.id AS vehicle_id, model_name, make, model_year_id, icbc_registration_data.create_timestamp, icbc_registration_data.update_timestamp, vin::TEXT AS vin, icbc_registration_data.icbc_upload_date_id FROM icbc_vehicle JOIN icbc_registration_data ON icbc_vehicle.id = icbc_registration_data.icbc_vehicle_id ) TO 'output.csv' WITH CSV HEADER;\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import os\n",
"\n",
"import pandas as pd\n",
"import openpyxl\n",
"from openpyxl.styles import PatternFill\n",
"\n",
"def _values_differ(left, right):\n",
"    \"\"\"Return a boolean Series marking rows where two aligned columns differ.\n",
"\n",
"    A pair of missing values counts as equal; a plain ``!=`` would flag\n",
"    NaN-vs-NaN as a difference because NaN never equals itself.\n",
"    \"\"\"\n",
"    both_missing = left.isna() & right.isna()\n",
"    return left.ne(right) & ~both_missing\n",
"\n",
"\n",
"def compare_csvs(file1, file2, output_file, compare_cols=None):\n",
"    \"\"\"Compare two CSV exports by VIN and write the differing rows to an Excel file.\n",
"\n",
"    The files are outer-joined on ``vin``. A row is reported when its VIN is\n",
"    present in only one file, or when any column in ``compare_cols`` holds a\n",
"    different value in the two files. Reported rows go to the 'Comparison'\n",
"    sheet of ``output_file``, under a banner row naming each source file.\n",
"    VINs found in only one file are filled green; mismatched value pairs are\n",
"    filled yellow.\n",
"\n",
"    Args:\n",
"        file1: Path of the first CSV; needs a 'vin' column plus every column\n",
"            listed in ``compare_cols``.\n",
"        file2: Path of the second CSV; needs the same columns as ``file1``.\n",
"        output_file: Path of the .xlsx workbook to write.\n",
"        compare_cols: Names of the columns to compare. Defaults to\n",
"            ['vehicle_id', 'model_name', 'make', 'model_year']; pass\n",
"            'model_year_id' in place of 'model_year' for exports that carry\n",
"            the id instead of the description.\n",
"\n",
"    Raises:\n",
"        KeyError: If 'vin' or a column in ``compare_cols`` is missing from\n",
"            either file.\n",
"    \"\"\"\n",
"    if compare_cols is None:\n",
"        compare_cols = ['vehicle_id', 'model_name', 'make', 'model_year']\n",
"    compare_cols = list(compare_cols)\n",
"\n",
"    # Read VINs as text so they are never parsed as numbers.\n",
"    file_1_df = pd.read_csv(file1, dtype={'vin': str})\n",
"    file_2_df = pd.read_csv(file2, dtype={'vin': str})\n",
"\n",
"    # Remove accidental quotes if they exist\n",
"    for frame in (file_1_df, file_2_df):\n",
"        frame['vin'] = frame['vin'].str.strip('\"')\n",
"\n",
"    # Outer join on the VIN; the indicator records which file(s) each VIN came from.\n",
"    merged_df = pd.merge(\n",
"        file_1_df, file_2_df, on='vin', how='outer',\n",
"        suffixes=('_file_1', '_file_2'), indicator=True,\n",
"    )\n",
"    # Take the indicator out again so it is not written to the workbook.\n",
"    origin = merged_df.pop('_merge')\n",
"    only_in_file_1 = origin == 'left_only'\n",
"    only_in_file_2 = origin == 'right_only'\n",
"\n",
"    print(f'Total rows before filtering: {len(merged_df)}')\n",
"\n",
"    print('VINs only in File 1 (not in File 2):')\n",
"    print(merged_df.loc[only_in_file_1, 'vin'].tolist())\n",
"\n",
"    print('\\nVINs only in File 2 (not in File 1):')\n",
"    print(merged_df.loc[only_in_file_2, 'vin'].tolist())\n",
"\n",
"    # One boolean column per compared field: True where the two files disagree.\n",
"    diff_mask = pd.DataFrame(\n",
"        {\n",
"            col: _values_differ(merged_df[col + '_file_1'], merged_df[col + '_file_2'])\n",
"            for col in compare_cols\n",
"        },\n",
"        index=merged_df.index,\n",
"    )\n",
"\n",
"    # Keep rows with at least one mismatch, plus every VIN missing from one file.\n",
"    report_mask = diff_mask.any(axis=1) | only_in_file_1 | only_in_file_2\n",
"    filtered_df = merged_df[report_mask]\n",
"    print(f'Total rows after filtering: {len(filtered_df)}')\n",
"\n",
"    # Write the data first, then reopen the workbook to add the banner and fills.\n",
"    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:\n",
"        filtered_df.to_excel(writer, index=False, sheet_name='Comparison')\n",
"\n",
"    wb = openpyxl.load_workbook(output_file)\n",
"    sheet = wb['Comparison']\n",
"    sheet.insert_rows(1, amount=1)  # banner row above the column headers\n",
"\n",
"    # The merge keeps every file-1 column (including the shared 'vin') first,\n",
"    # so the file-2 columns start right after them.\n",
"    file_1_width = len(file_1_df.columns)\n",
"    total_width = len(merged_df.columns)\n",
"    banners = [(1, file_1_width, file1), (file_1_width + 1, total_width, file2)]\n",
"    banner_font = openpyxl.styles.Font(size=16, bold=True)\n",
"    for first_col, last_col, path in banners:\n",
"        if last_col < first_col:\n",
"            continue  # this file contributed no columns of its own\n",
"        if last_col > first_col:\n",
"            sheet.merge_cells(start_row=1, start_column=first_col, end_row=1, end_column=last_col)\n",
"        banner_cell = sheet.cell(row=1, column=first_col)\n",
"        banner_cell.value = f'File: {os.path.basename(os.fspath(path))}'\n",
"        banner_cell.font = banner_font\n",
"\n",
"    green_fill = PatternFill(start_color='00FF00', end_color='00FF00', fill_type='solid')\n",
"    yellow_fill = PatternFill(start_color='FFFF00', end_color='FFFF00', fill_type='solid')\n",
"\n",
"    # Excel columns are 1-based; DataFrame positions are 0-based.\n",
"    vin_col_idx = merged_df.columns.get_loc('vin') + 1\n",
"    pair_col_idx = {\n",
"        col: (\n",
"            merged_df.columns.get_loc(col + '_file_1') + 1,\n",
"            merged_df.columns.get_loc(col + '_file_2') + 1,\n",
"        )\n",
"        for col in compare_cols\n",
"    }\n",
"\n",
"    # Row 1 is the banner and row 2 the column headers, so data starts at row 3.\n",
"    for excel_row, row_idx in enumerate(filtered_df.index, start=3):\n",
"        if only_in_file_1.at[row_idx] or only_in_file_2.at[row_idx]:\n",
"            # The VIN is missing from one file: mark the VIN itself instead of\n",
"            # highlighting every compared column.\n",
"            sheet.cell(row=excel_row, column=vin_col_idx).fill = green_fill\n",
"            continue\n",
"        for col, (col_idx_1, col_idx_2) in pair_col_idx.items():\n",
"            if diff_mask.at[row_idx, col]:\n",
"                sheet.cell(row=excel_row, column=col_idx_1).fill = yellow_fill\n",
"                sheet.cell(row=excel_row, column=col_idx_2).fill = yellow_fill\n",
"\n",
"    wb.save(output_file)\n",
"    print('Finished!')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"preprocessed_dataset = 'output-preprocessed.csv'\n",
"only_additions = 'output_only_additions.csv'\n",
"compare_csvs(preprocessed_dataset, only_additions, \"only_additions.xlsx\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
3 changes: 1 addition & 2 deletions openshift/scripts/import-data-from-local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@ set -e

# 1 Argument = 'local container name or id'
# example command
# . ./import-data.sh 398cd4661173

# . ./import-data-from-local.sh 398cd4661173
if [ "$#" -ne 1 ]; then
echo "Passed $# parameters. Expected 1."
exit 1
Expand Down