# Labels for the spreadsheet columns that appear in the report, keyed by their
# 0-based position in a row. Position 0 is deliberately absent: its value is
# what gets listed under "files" in the report, it is not a compared field.
_HUMAN_READABLE_COLUMN_NAMES: dict[int, str] = {
    1: "Title",
    2: "Year",
    3: "Month",
    4: "Day",
    5: "Place",
    6: "Author",
    7: "Recipient",
    8: "Subject",
}

# Values are compared pairwise on positions 1..7 only.
_LAST_COMPARED_COLUMN = 7


class PartialMatcher:
    """Group consecutive spreadsheet rows by the columns they have in common.

    Every row of every scanned workbook is compared with the row directly
    above it. The result is accumulated in ``partial_matches`` and the number
    of rows read in ``total_count``.
    """

    def __init__(self) -> None:
        # Outer key: how many columns matched (8 down to 1, which is also the
        # order in which the groups are written to the report).
        # Inner key: the matching column labels joined with ", ".
        # Value: ``str(row[0])`` of every row that matched its predecessor.
        self.partial_matches: dict[int, defaultdict[str, list[str]]] = {
            count: defaultdict(list)
            for count in range(len(_HUMAN_READABLE_COLUMN_NAMES), 0, -1)
        }
        self.total_count = 0

    def run(self, print_to_file: bool, sanitize: bool) -> None:
        """Find all rows that are partial matches.

        Scans every ``Paesi*`` workbook in the sanitized directory in volume
        order, prints the number of rows read and writes the grouped matches
        to ``outputs/partial_matches.json``.

        Args:
            print_to_file: Not consulted at the moment; the report is always
                written to the JSON file.
                NOTE(review): decide whether this flag should gate the write.
            sanitize: When true, ``create_sanitized_xlsx`` is called on the
                input directory before the sanitized directory is scanned.

        Raises:
            ValueError: If a ``Paesi*`` file name does not follow the
                ``Paesi Bassi VOLUME <n>_it_IT.xlsx`` pattern.
        """
        input_dir = Path("inputs") / "VolumesExcel" / "it_IT"
        if sanitize:
            create_sanitized_xlsx(str(input_dir))
        sanitized_dir = str(Path("outputs") / "VolumesExcelSanitized" / "it_IT")

        workbook_names = [
            name for name in os.listdir(sanitized_dir) if name.startswith("Paesi")
        ]
        for workbook_name in sorted(workbook_names, key=self._volume_number):
            self._find_partial_match(workbook_name, sanitized_dir)

        print(self.total_count)

        # Rows for which all eight labelled columns match are left out.
        report = {
            count: {
                labels: {"count": len(rows), "files": rows}
                for labels, rows in groups.items()
            }
            for count, groups in self.partial_matches.items()
            if count != 8
        }

        with open("outputs/partial_matches.json", "w", encoding="utf-8") as output:
            json.dump(report, output, ensure_ascii=False, indent=4)

    @staticmethod
    def _volume_number(file_name: str) -> int:
        """Return the volume number embedded in a workbook file name."""
        return int(
            file_name.replace("Paesi Bassi VOLUME ", "").replace("_it_IT.xlsx", "")
        )

    @staticmethod
    def _matching_columns(
        prev_row: tuple[object, ...], row: tuple[object, ...]
    ) -> set[int]:
        """Return the labelled column positions on which two rows agree.

        Position 0 is never part of the result, so every returned position
        has a label and at most eight positions are returned. Positions after
        the last compared one, up to and including 8, are always reported as
        matching.
        NOTE(review): this means "Subject" (position 8) is reported without
        being compared, exactly as before; confirm that this is intended.
        """
        compared_pairs = list(zip(prev_row, row))[: _LAST_COMPARED_COLUMN + 1]
        matching = {
            index
            for index, (old_value, new_value) in enumerate(compared_pairs)
            if index in _HUMAN_READABLE_COLUMN_NAMES and old_value == new_value
        }
        # Pad from the first position that was NOT compared, so a column whose
        # values differ is never re-added; never pad position 0.
        first_uncompared = max(len(compared_pairs), 1)
        matching.update(
            range(first_uncompared, len(_HUMAN_READABLE_COLUMN_NAMES) + 1)
        )
        return matching

    def _record_match(
        self, prev_row: tuple[object, ...], row: tuple[object, ...]
    ) -> None:
        """File ``row`` under the set of columns it shares with ``prev_row``."""
        matching_indices = self._matching_columns(prev_row, row)
        if not matching_indices:
            return
        labels = ", ".join(
            _HUMAN_READABLE_COLUMN_NAMES[i] for i in sorted(matching_indices)
        )
        self.partial_matches[len(matching_indices)][labels].append(str(row[0]))

    def _find_partial_match(self, file: str, sanitized_dir: str) -> None:
        """Compare each row of the workbook's first sheet with the row above.

        Args:
            file: Workbook file name inside ``sanitized_dir``.
            sanitized_dir: Directory that contains the workbook.
        """
        workbook = load_workbook(Path(sanitized_dir) / file)
        first_sheet = workbook[workbook.sheetnames[0]]

        prev_row = None
        for row in first_sheet.iter_rows(values_only=True):
            self.total_count += 1
            # The first row has nothing to be compared with, and rows whose
            # first cell is empty are counted but not compared.
            if prev_row is not None and row[0] is not None:
                self._record_match(prev_row, row)
            prev_row = row
# --- python/xlsx_functions/helper_functions.py ---


def compare_rows(
    row1: tuple[Cell, ...], row2: tuple[Cell, ...], start_index: int = 1
) -> bool:
    """Compare the values of two rows, skipping the cells before start_index."""
    return [i.value for i in row1[start_index:]] == [
        i.value for i in row2[start_index:]
    ]


# --- python/xlsx_functions/grouped_scans.py ---


def add_grouped_scans_column(directory_name: str, file_name: str) -> None:
    """Create and write a .xlsx file with a grouped scans column.

    A new second column is inserted into the first sheet. Each row receives,
    in that column, the first-column value of the row that opened its group:
    a row joins the group of the rows above it as long as all of its cells
    from the third column onwards equal those of the row that opened the
    group; otherwise it opens a new group.

    The workbook is saved under the same file name in the directory obtained
    by replacing "inputs" with "outputs" and "VolumesExcel/" with
    "VolumesExcelSanitized/" in ``directory_name``.

    Args:
        directory_name: Directory that contains the workbook.
        file_name: Name of the workbook file.
    """
    workbook = load_workbook(f"{directory_name}/{file_name}")
    sheet = workbook[workbook.sheetnames[0]]

    sheet.insert_cols(2)

    group_head = None
    for index, row in enumerate(sheet.iter_rows(), start=1):
        # start_index=2 skips the first column and the freshly inserted one.
        if group_head is None or not compare_rows(row, group_head, start_index=2):
            group_head = row
        sheet.cell(row=index, column=2).value = group_head[0].value

    new_directory = directory_name.replace("inputs", "outputs").replace(
        "VolumesExcel/", "VolumesExcelSanitized/"
    )
    os.makedirs(
        os.path.join(os.getcwd(), new_directory),
        exist_ok=True,
    )
    workbook.save(f"{new_directory}/{file_name}")


# --- python/xlsx_make.py ---


def create_xlsx_with_grouped_scans_column(directory_name: str) -> None:
    """Create .xlsx files while adding a column for grouped scans.

    Every directory entry is printed; entries whose name starts with "Paesi"
    and does not contain "~$" are passed to ``add_grouped_scans_column``.
    """
    directory_path = os.path.realpath(directory_name)
    for file in sorted(os.listdir(directory_path)):
        print(file)
        if "~$" not in file and file.startswith("Paesi"):
            add_grouped_scans_column(directory_name, os.fsdecode(file))
the full process of input files till separate translations and control file.""" print("STARTING CREATION OF .XLSX DOCUMENTS\n") @@ -198,7 +210,7 @@ def do_full_loop() -> None: if __name__ == "__main__": - create_xlsx_with_identifier_columns("inputs/VolumesExcel/it_IT") + create_xlsx_with_grouped_scans_column("inputs/VolumesExcel/it_IT") # create_sanitized_xlsx("inputs/VolumesExcel/it_IT") # create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "en_GB") # create_translated_xlsx("outputs/VolumesExcelSanitized/it_IT", "nl_NL")