diff --git a/check_biblio.py b/check_biblio.py index 495c1d9..72486ee 100755 --- a/check_biblio.py +++ b/check_biblio.py @@ -8,41 +8,68 @@ import argparse import sqlite3 import difflib -from typing import Optional +from typing import Optional, List, Tuple +import threading +from concurrent.futures import ThreadPoolExecutor, as_completed +import functools +import tqdm +import logging import bibtexparser +print_lock = threading.Lock() + class DataBase: """A simple database to store the original and final strings""" def __init__(self, filename: str): self.filename = filename - self.con = sqlite3.connect(filename) - self.cur = self.con.cursor() - self.cur.execute("CREATE TABLE IF NOT EXISTS entries(key, original, final)") + self._con = sqlite3.connect(filename, check_same_thread=False) + self._cur = self._con.cursor() + self._cur.execute("CREATE TABLE IF NOT EXISTS entries(key, original, final)") + self._lock = threading.Lock() + self._nupdates = 0 + self.substitutions: List[Tuple[str, str]] = [] def update(self, key: str, original: str, final: str): - self.cur.execute("INSERT INTO entries VALUES (?, ?, ?)", (key, original.strip(), final.strip())) + with self._lock: + self._cur.execute( + "INSERT INTO entries VALUES (?, ?, ?)", + (key, original.strip(), final.strip()), + ) + self._nupdates += 1 + if self._nupdates % 20 == 0: + self._con.commit() + if original != final: + self.substitutions.append((original, final)) def query(self, original: str): try: query = "SELECT * FROM entries WHERE original=?" - res = self.cur.execute(query, (original,)) - r = res.fetchone() + with self._lock: + res = self._cur.execute(query, (original,)) + r = res.fetchone() except sqlite3.OperationalError as ex: print(f"problem executing query {query} with {original}") print("error: %s" % ex) raise ex if r: - return r[2] - else: - return None + proposed = r[2] + if original != proposed: + with self._lock: + self.substitutions.append((original, proposed)) + return proposed + return None + + def commit(self): + with self._lock: + self._con.commit() def __del__(self): print("closing database") - self.con.commit() - self.con.close() + self._con.commit() + self._con.close() def diff_strings(a: str, b: str) -> str: @@ -67,6 +94,8 @@ def diff_strings(a: str, b: str) -> str: regex_unicode = re.compile("[^\x00-\x7F]") + + def help_unicode(item: str) -> Optional[str]: m = regex_unicode.search(item) if m: @@ -77,6 +106,7 @@ def help_unicode(item: str) -> Optional[str]: + "*****UNICODE******" + item[m.end() :] ) + return None def replace_unicode(item: str) -> str: @@ -92,7 +122,7 @@ def replace_unicode(item: str) -> str: def replace_chars(match): char = match.group(0) - print('unicode found, replacing "%s" with "%s"' % (char, chars[char])) + logging.debug('unicode found, replacing "%s" with "%s"', char, chars[char]) return chars[char] return re.sub("(" + "|".join(list(chars.keys())) + ")", replace_chars, item) @@ -117,18 +147,11 @@ def find_error_latex(filename: str) -> str: def modify_item(item: str, error: str) -> str: editor_command = os.environ.get("EDITOR") - if not editor_command: - print("you haven't defined a default EDITOR, (e.g. export EDITOR=emacs)") - editor_command = input( - "enter the command to open an editor (e.g. emacs/atom -w/...): " - ) - os.environ["EDITOR"] = editor_command - editor_command = editor_command.strip() tmp_filename = next(tempfile._get_candidate_names()) with open(tmp_filename, "w", encoding="utf-8") as f: - preamble = "do not delete these lines\n" "error found:\n" + preamble = "do not delete these lines\n" + "error found:\n" preamble += error r = help_unicode(item) if r is not None: @@ -214,32 +237,32 @@ def check_latex_entry(key: str, tex: str, use_bibtex: bool = False) -> Optional[ return error -def run_entry(entry, db, fix_unicode, substitutions): +def run_entry(entry, db, fix_unicode) -> None: raw_original = entry.raw.strip() - raw_proposed = raw_original from_cache = db.query(raw_original) if from_cache is not None: - if raw_original != from_cache: - substitutions.append((raw_original, from_cache)) return + raw_proposed = raw_original + if fix_unicode: raw_proposed = replace_unicode(raw_original) if raw_proposed != raw_original: - print(f"unicode found in {entry.key}, fixing") + logging.debug("unicode found in %s, fixing", entry.key) while True: error = check_latex_entry(entry.key, raw_proposed, args.use_bibtex) if error is None: break - print(f"problem running item {entry.key}") - raw_proposed = modify_item(raw_proposed, error).strip() + with print_lock: + print(f"problem running item {entry.key}") + raw_proposed = modify_item(raw_proposed, error).strip() if raw_original != raw_proposed: - print(diff_strings(raw_original, raw_proposed)) - substitutions.append((raw_original, raw_proposed)) + with print_lock: + print(diff_strings(raw_original, raw_proposed)) db.update(entry.key, raw_original, raw_proposed) @@ -251,13 +274,21 @@ def run_entry(entry, db, fix_unicode, substitutions): ) parser.add_argument("bibtex", default="https://inspirehep.net/") parser.add_argument("--fix-unicode", action="store_true") + parser.add_argument("--nthreads", type=int, default=5) parser.add_argument( "--use-bibtex", action="store_true", help="use bibtex instead of biblatex" ) args = parser.parse_args() + editor_command = os.environ.get("EDITOR") + if not editor_command: + print("you haven't defined a default EDITOR, (e.g. export EDITOR=emacs)") + editor_command = input( + "enter the command to open an editor (e.g. emacs/atom -w/...): " + ) + os.environ["EDITOR"] = editor_command.strip() + try: - substitutions = [] biblio_parsed = bibtexparser.parse_file(args.bibtex) db = DataBase("db.sqlite") @@ -267,24 +298,44 @@ def run_entry(entry, db, fix_unicode, substitutions): print("found %d entries" % len(biblio_parsed.entries)) nentries = len(biblio_parsed.entries) - # import multiprocessing - # import functools - # p = multiprocessing.Pool(4) - # p.map(functools.partial(run_entry, cur=cur, fix_unicode=args.fix_unicode, substitutions=substitutions), biblio_parsed.entries) - for ientry, entry in enumerate(biblio_parsed.entries, 1): - print("checking key %s %d/%d" % (entry.key, ientry, nentries)) - run_entry(entry, db, args.fix_unicode, substitutions) + with ThreadPoolExecutor(max_workers=args.nthreads) as p: + with tqdm.tqdm(total=nentries) as pbar: + pbar.set_description("checking entries") + pbar.set_postfix_str(f"nthreads={args.nthreads}") + + def partial_function(entry): + with print_lock: + pbar.set_description(entry.key) + run_entry(entry, db, args.fix_unicode) + with print_lock: + pbar.update() + + futures = {} + + for entry in biblio_parsed.entries: + future = p.submit(partial_function, entry) + futures[future] = entry.key + for future in as_completed(futures): + key = futures[future] + try: + future.result() + except Exception as ex: + with print_lock: + pbar.write(f"problem with entry: {key}") + raise ex + with print_lock: + pbar.set_description(key) finally: - biblio = open(args.bibtex, encoding="utf-8").read() + substitutions = db.substitutions print(f"applying {len(substitutions)} substitutions") for old, new in substitutions: if old == new: print("BIG PROBLEM: old == new") print(diff_strings(old, new)) if old not in biblio: - print("BIG PROBLEM: old not in biblio") + print("BIG PROBLEM: old not in biblio: %s" % old) biblio = biblio.replace(old, new) new_biblio_fn = args.bibtex.replace(".bib", "_new.bib") with open(new_biblio_fn, "w", encoding="utf-8") as f: diff --git a/requirements.txt b/requirements.txt index 1c9a5eb..82e4713 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -bibtexparser==2.0.0b4 \ No newline at end of file +bibtexparser==2.0.0b4 +tqdm