Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Alignment data export #409

Closed
wants to merge 31 commits into from
Closed
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
e7667b8
Add sign export script
fsimonjetz Apr 3, 2023
e9894f8
add pandas to dependencies for export script
fsimonjetz Apr 3, 2023
6ec8941
Remove debug limit
fsimonjetz Apr 4, 2023
6f6ea7a
'Refactored by Sourcery' (#410)
sourcery-ai[bot] Apr 4, 2023
30fb091
Refactoring
fsimonjetz Apr 4, 2023
7064a5b
export template
fsimonjetz Apr 4, 2023
40a1731
add caic reports io
fsimonjetz Apr 5, 2023
7efa57e
fix reference aggregation
fsimonjetz Apr 5, 2023
f1e74fd
Move team names into environment variable
fsimonjetz Apr 5, 2023
23fc057
Fix monthly index
fsimonjetz Apr 5, 2023
9ba5f42
Less restrictive field matching
fsimonjetz Apr 5, 2023
58dde97
better references display
fsimonjetz Apr 13, 2023
77b5d98
Merge branch 'master' into alignment-data-export
fsimonjetz Apr 13, 2023
6be4274
Refactoring; add zip to export
fsimonjetz Apr 13, 2023
76f487e
Ignore output files
fsimonjetz Apr 13, 2023
c7ef43d
Use tar.gz instead of zip
fsimonjetz Apr 13, 2023
9ff50e7
Merge branch 'extend-revision-record' into alignment-data-export
fsimonjetz Apr 14, 2023
d635c93
Remove unrelated changes
fsimonjetz Apr 14, 2023
2a8037a
remove vocab generation
fsimonjetz Apr 17, 2023
b55a3cb
Remove old export script
fsimonjetz Apr 21, 2023
9d1eb91
include more chapter infos
fsimonjetz Apr 25, 2023
674e83c
Remove filtering "empty" texts
fsimonjetz Apr 25, 2023
b12ba58
use stage abbreviation instead of long name;
fsimonjetz Apr 25, 2023
bd0a703
Fix dtypes; add url column
fsimonjetz May 10, 2023
e9ad32f
include colophon and unplaced line counts
fsimonjetz Jun 1, 2023
d7c4eae
Always use the production db for export
fsimonjetz Jun 1, 2023
a9456c6
add colophon option
fsimonjetz Jun 2, 2023
1caa714
Merge branch 'master' into alignment-data-export
fsimonjetz Jun 2, 2023
82b55ab
add sign export task
fsimonjetz Jun 2, 2023
41b6c36
reformatting
fsimonjetz Jun 2, 2023
044cf3b
Merge branch 'master' into alignment-data-export
fsimonjetz Jun 2, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactoring; add zip to export
fsimonjetz committed Apr 13, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
commit 6be4274f1c4ee1ef90c627bf12c269b88ef529c8
Empty file added ebl/io/alignment/__init__.py
Empty file.
Binary file added ebl/io/alignment/alignment_data_2023-04-13.zip
Binary file not shown.
131 changes: 131 additions & 0 deletions ebl/io/alignment/data_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
from pymongo import MongoClient
from ebl.mongo_collection import MongoCollection
from ebl.transliteration.infrastructure.collections import (
FRAGMENTS_COLLECTION,
CHAPTERS_COLLECTION,
)
from ebl.corpus.domain.manuscript import (
ManuscriptType,
Provenance,
)
from ebl.common.domain.period import Period
import os
import shutil
import pandas as pd
import numpy as np
import datetime

# pandas emits SettingWithCopyWarning for the column assignments done on
# filtered frames below; it is treated as a false positive here and silenced.
pd.options.mode.chained_assignment = None

# Connection settings come from the environment: MONGODB_URI is mandatory
# (KeyError if unset), MONGODB_DB is optional (None if unset).
client = MongoClient(os.environ["MONGODB_URI"])
DB = os.getenv("MONGODB_DB")
database = client.get_database(DB)
fragments, chapters = (
    MongoCollection(database, collection_name)
    for collection_name in (FRAGMENTS_COLLECTION, CHAPTERS_COLLECTION)
)

# Staging directory for the TSV files; created up front, reused if present.
tmp_path = "/tmp/alignment_output"
os.makedirs(tmp_path, exist_ok=True)


def enum_mapping(enum):
    """Return a dict mapping each member's ``long_name`` to its ``abbreviation``.

    ``enum`` is iterated once; every item must expose ``long_name`` and
    ``abbreviation`` attributes. If two items share a ``long_name``, the one
    iterated last wins.
    """
    abbreviations = {}
    for member in enum:
        abbreviations[member.long_name] = member.abbreviation
    return abbreviations


if __name__ == "__main__":
    # Export script. Writes three TSV files into ``tmp_path``:
    #   fragment_signs.tsv - signs of every fragment that has clear signs
    #   chapter_signs.tsv  - signs of every chapter manuscript, with its siglum
    #   vocabulary.tsv     - every distinct sign token with an integer code
    # and then zips that directory into an archive next to this module.
    print(f"Writing data to {tmp_path}...")
    print("Exporting fragments...")

    fragment_signs = fragments.find_many(
        {"signs": {"$exists": True, "$ne": ""}}, projection={"signs": True}
    )
    df_fragments = pd.DataFrame.from_records(fragment_signs)

    # Exclude rows without clear signs, i.e. rows whose signs consist only of
    # "X" and whitespace. Missing values are normalised to "" first (as for
    # chapters below): the query above still lets null ``signs`` through, and
    # a NaN in the match mask cannot be inverted with ``~``.
    df_fragments["signs"] = df_fragments["signs"].fillna("")
    df_fragments = df_fragments[~df_fragments.signs.str.fullmatch(r"[X\s]*")]

    df_fragments.to_csv(
        os.path.join(tmp_path, "fragment_signs.tsv"), index=False, sep="\t"
    )

    print("Exporting chapters...")

    siglum_columns = ["provenance", "period", "type", "disambiguator"]
    siglum_enums = [Provenance, Period, ManuscriptType]

    # ``zip`` stops at the shorter list, so only the first three columns get
    # an enum; "disambiguator" is deliberately left unmapped.
    abbreviation_mappings = dict(zip(siglum_columns, siglum_enums))

    # Pair every manuscript with the signs entry at the same index, then emit
    # one record per (manuscript, signs) pair.
    chapter_signs = chapters.aggregate(
        [
            {
                "$project": {
                    "manuscripts": {"$zip": {"inputs": ["$manuscripts", "$signs"]}}
                }
            },
            {"$unwind": "$manuscripts"},
            {
                "$project": {
                    "manuscript": {"$first": "$manuscripts"},
                    "signs": {"$last": "$manuscripts"},
                }
            },
            {
                "$project": {
                    "_id": 0,
                    "id": {"$toString": "$_id"},
                    "provenance": "$manuscript.provenance",
                    "period": "$manuscript.period",
                    "type": "$manuscript.type",
                    "disambiguator": "$manuscript.siglumDisambiguator",
                    "signs": 1,
                }
            },
        ]
    )
    df_chapters = pd.DataFrame.from_records(chapter_signs)

    # Exclude rows without clear signs (same rule as for fragments).
    df_chapters["signs"] = df_chapters["signs"].fillna("")
    df_chapters = df_chapters[~df_chapters.signs.str.fullmatch(r"[X\s]*")]

    # Map long names to abbreviations.
    for column, enum in abbreviation_mappings.items():
        df_chapters[column] = df_chapters[column].map(enum_mapping(enum))

    # ``Series.map`` yields NaN for long names missing from the enum, and a
    # field absent from a record is NaN as well; "".join would raise TypeError
    # on those, so treat them as empty siglum parts.
    df_chapters["siglum"] = (
        df_chapters[siglum_columns].fillna("").agg("".join, axis=1)
    )
    df_chapters = df_chapters[
        [
            "id",
            "siglum",
            "signs",
        ]
    ]

    df_chapters.to_csv(
        os.path.join(tmp_path, "chapter_signs.tsv"), index=False, sep="\t"
    )

    print("Exporting vocabulary...")

    # Whitespace-split every signs string into tokens and keep each distinct
    # token once (order of first appearance).
    all_signs = (
        pd.concat([df_fragments, df_chapters])["signs"].str.split().explode().unique()
    )
    # ``all_signs`` is already de-duplicated, so the inverse indices returned
    # by ``np.unique`` are simply each token's rank in sorted order.
    _, codes = np.unique(all_signs, return_inverse=True)

    df = pd.DataFrame({"sign": all_signs, "code": codes})
    df.to_csv(os.path.join(tmp_path, "vocabulary.tsv"), index=False, sep="\t")

    # ``make_archive`` appends the ".zip" extension itself, so the base name
    # is passed without it.
    zip_path = os.path.join(
        os.path.dirname(__file__), f"alignment_data_{datetime.date.today()}"
    )

    print(f"Storing archive in '{zip_path}.zip'...")
    shutil.make_archive(
        zip_path,
        "zip",
        tmp_path,
    )

    print("Done.")