Skip to content

Commit

Permalink
Merge pull request #2 from wiso/threading
Browse files Browse the repository at this point in the history
Threading
  • Loading branch information
wiso authored Nov 16, 2023
2 parents 3061424 + 18dda1f commit c3048c6
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 41 deletions.
131 changes: 91 additions & 40 deletions check_biblio.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,68 @@
import argparse
import sqlite3
import difflib
from typing import Optional
from typing import Optional, List, Tuple
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import functools

Check failure on line 14 in check_biblio.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

check_biblio.py:14:8: F401 `functools` imported but unused
import tqdm
import logging

import bibtexparser

print_lock = threading.Lock()


class DataBase:
"""A simple database to store the original and final strings"""

def __init__(self, filename: str):
self.filename = filename
self.con = sqlite3.connect(filename)
self.cur = self.con.cursor()
self.cur.execute("CREATE TABLE IF NOT EXISTS entries(key, original, final)")
self._con = sqlite3.connect(filename, check_same_thread=False)
self._cur = self._con.cursor()
self._cur.execute("CREATE TABLE IF NOT EXISTS entries(key, original, final)")
self._lock = threading.Lock()
self._nupdates = 0
self.substitutions: List[Tuple[str, str]] = []

def update(self, key: str, original: str, final: str):
self.cur.execute("INSERT INTO entries VALUES (?, ?, ?)", (key, original.strip(), final.strip()))
with self._lock:
self._cur.execute(
"INSERT INTO entries VALUES (?, ?, ?)",
(key, original.strip(), final.strip()),
)
self._nupdates += 1
if self._nupdates % 20 == 0:
self._con.commit()
if original != final:
self.substitutions.append((original, final))

def query(self, original: str):
try:
query = "SELECT * FROM entries WHERE original=?"
res = self.cur.execute(query, (original,))
r = res.fetchone()
with self._lock:
res = self._cur.execute(query, (original,))
r = res.fetchone()
except sqlite3.OperationalError as ex:
print(f"problem executing query {query} with {original}")
print("error: %s" % ex)
raise ex
if r:
return r[2]
else:
return None
proposed = r[2]
if original != proposed:
with self._lock:
self.substitutions.append((original, proposed))
return proposed
return None

def commit(self):
with self._lock:
self._con.commit()

def __del__(self):
print("closing database")
self.con.commit()
self.con.close()
self._con.commit()
self._con.close()


def diff_strings(a: str, b: str) -> str:
Expand All @@ -67,6 +94,8 @@ def diff_strings(a: str, b: str) -> str:


regex_unicode = re.compile("[^\x00-\x7F]")


def help_unicode(item: str) -> Optional[str]:
m = regex_unicode.search(item)
if m:
Expand All @@ -77,6 +106,7 @@ def help_unicode(item: str) -> Optional[str]:
+ "*****UNICODE******"
+ item[m.end() :]
)
return None


def replace_unicode(item: str) -> str:
Expand All @@ -92,7 +122,7 @@ def replace_unicode(item: str) -> str:

def replace_chars(match):
char = match.group(0)
print('unicode found, replacing "%s" with "%s"' % (char, chars[char]))
logging.debug('unicode found, replacing "%s" with "%s"', char, chars[char])
return chars[char]

return re.sub("(" + "|".join(list(chars.keys())) + ")", replace_chars, item)
Expand All @@ -117,18 +147,11 @@ def find_error_latex(filename: str) -> str:

def modify_item(item: str, error: str) -> str:
editor_command = os.environ.get("EDITOR")
if not editor_command:
print("you haven't defined a default EDITOR, (e.g. export EDITOR=emacs)")
editor_command = input(
"enter the command to open an editor (e.g. emacs/atom -w/...): "
)
os.environ["EDITOR"] = editor_command
editor_command = editor_command.strip()

tmp_filename = next(tempfile._get_candidate_names())

with open(tmp_filename, "w", encoding="utf-8") as f:
preamble = "do not delete these lines\n" "error found:\n"
preamble = "do not delete these lines\n" + "error found:\n"
preamble += error
r = help_unicode(item)
if r is not None:
Expand Down Expand Up @@ -214,32 +237,32 @@ def check_latex_entry(key: str, tex: str, use_bibtex: bool = False) -> Optional[
return error


def run_entry(entry, db, fix_unicode, substitutions):
def run_entry(entry, db, fix_unicode) -> None:
raw_original = entry.raw.strip()
raw_proposed = raw_original

from_cache = db.query(raw_original)
if from_cache is not None:
if raw_original != from_cache:
substitutions.append((raw_original, from_cache))
return

raw_proposed = raw_original

if fix_unicode:
raw_proposed = replace_unicode(raw_original)
if raw_proposed != raw_original:
print(f"unicode found in {entry.key}, fixing")
logging.debug("unicode found in %s, fixing", entry.key)

while True:
error = check_latex_entry(entry.key, raw_proposed, args.use_bibtex)
if error is None:
break

print(f"problem running item {entry.key}")
raw_proposed = modify_item(raw_proposed, error).strip()
with print_lock:
print(f"problem running item {entry.key}")
raw_proposed = modify_item(raw_proposed, error).strip()

if raw_original != raw_proposed:
print(diff_strings(raw_original, raw_proposed))
substitutions.append((raw_original, raw_proposed))
with print_lock:
print(diff_strings(raw_original, raw_proposed))
db.update(entry.key, raw_original, raw_proposed)


Expand All @@ -251,13 +274,21 @@ def run_entry(entry, db, fix_unicode, substitutions):
)
parser.add_argument("bibtex", default="https://inspirehep.net/")
parser.add_argument("--fix-unicode", action="store_true")
parser.add_argument("--nthreads", type=int, default=5)
parser.add_argument(
"--use-bibtex", action="store_true", help="use bibtex instead of biblatex"
)
args = parser.parse_args()

editor_command = os.environ.get("EDITOR")
if not editor_command:
print("you haven't defined a default EDITOR, (e.g. export EDITOR=emacs)")
editor_command = input(
"enter the command to open an editor (e.g. emacs/atom -w/...): "
)
os.environ["EDITOR"] = editor_command.strip()

try:
substitutions = []
biblio_parsed = bibtexparser.parse_file(args.bibtex)
db = DataBase("db.sqlite")

Expand All @@ -267,24 +298,44 @@ def run_entry(entry, db, fix_unicode, substitutions):
print("found %d entries" % len(biblio_parsed.entries))

nentries = len(biblio_parsed.entries)
# import multiprocessing
# import functools
# p = multiprocessing.Pool(4)
# p.map(functools.partial(run_entry, cur=cur, fix_unicode=args.fix_unicode, substitutions=substitutions), biblio_parsed.entries)
for ientry, entry in enumerate(biblio_parsed.entries, 1):
print("checking key %s %d/%d" % (entry.key, ientry, nentries))
run_entry(entry, db, args.fix_unicode, substitutions)

with ThreadPoolExecutor(max_workers=args.nthreads) as p:
with tqdm.tqdm(total=nentries) as pbar:
pbar.set_description("checking entries")
pbar.set_postfix_str(f"nthreads={args.nthreads}")

def partial_function(entry):
with print_lock:
pbar.set_description(entry.key)
run_entry(entry, db, args.fix_unicode)
with print_lock:
pbar.update()

futures = {}

for entry in biblio_parsed.entries:
future = p.submit(partial_function, entry)
futures[future] = entry.key
for future in as_completed(futures):
key = futures[future]
try:
future.result()
except Exception as ex:
with print_lock:
pbar.write(f"problem with entry: {key}")
raise ex
with print_lock:
pbar.set_description(key)
finally:

biblio = open(args.bibtex, encoding="utf-8").read()
substitutions = db.substitutions
print(f"applying {len(substitutions)} substitutions")
for old, new in substitutions:
if old == new:
print("BIG PROBLEM: old == new")
print(diff_strings(old, new))
if old not in biblio:
print("BIG PROBLEM: old not in biblio")
print("BIG PROBLEM: old not in biblio: %s" % old)
biblio = biblio.replace(old, new)
new_biblio_fn = args.bibtex.replace(".bib", "_new.bib")
with open(new_biblio_fn, "w", encoding="utf-8") as f:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
bibtexparser==2.0.0b4
bibtexparser==2.0.0b4
tqdm

0 comments on commit c3048c6

Please sign in to comment.