Skip to content

Commit f0e1e8c

Browse files
Improved onboard_project.py (#783)
* Improved onboard_project.py
  - Added collect_verse_counts to onboard_project
  - Replaced command line args with a YAML config
* Updated onboard_project
* Made --config optional and added args
  - Made --extract-corpora and --collect-verse-counts args that work without requiring a config, using default values to run both tasks.
* Fixed some bugs with collect_verse_counts defaults
* Check project files before collect_verse_counts
1 parent 18588d3 commit f0e1e8c

File tree

3 files changed

+85
-38
lines changed

3 files changed

+85
-38
lines changed

silnlp/common/collect_verse_counts.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,8 @@ def main() -> None:
264264
# If the output folder doesn't exist locally, assume it's an experiment folder
265265
folder = args.folder.replace("\\", "/")
266266
if Path(folder).exists():
267-
exp_name = None
268267
folder = Path(folder)
269268
else:
270-
exp_name = folder
271269
folder = get_mt_exp_dir(folder)
272270

273271
# If no files are listed and folder is an experiment, use the files listed in the config file

silnlp/common/extract_corpora.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ def extract_corpora(
105105
extract_lemmas,
106106
extract_project_vrefs,
107107
)
108+
LOGGER.info(f"Extracted corpus file: {corpus_filename}")
108109
# check if the number of lines in the file is correct (the same as vref.txt)
109110
LOGGER.info(f"# of Verses: {verse_count}")
110111
if verse_count != expected_verse_count:

silnlp/common/onboard_project.py

Lines changed: 84 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,11 @@
22
import logging
33
from pathlib import Path
44

5+
import yaml
6+
7+
from .collect_verse_counts import collect_verse_counts
58
from .environment import SIL_NLP_ENV
9+
from .extract_corpora import extract_corpora
610

711
LOGGER = logging.getLogger(__package__ + ".onboard_project")
812

@@ -62,70 +66,114 @@ def main() -> None:
6266
parser.add_argument(
6367
"project",
6468
help="Paratext project name. The project will be stored on the bucket at Paratext/projects/<project>.",
69+
type=str,
6570
)
6671
parser.add_argument(
6772
"--copy-from",
6873
help="Path to a downloaded Paratext project folder. The local project will be copied to the bucket.",
6974
default=None,
75+
type=str,
7076
)
7177
parser.add_argument(
72-
"--overwrite", help="Overwrite any existing files and folders", default=False, action="store_true"
73-
)
74-
parser.add_argument(
75-
"--extract-corpora",
76-
help="Extract text corpora.",
77-
default=False,
78-
action="store_true",
79-
)
80-
parser.add_argument(
81-
"--include",
82-
metavar="books",
83-
nargs="+",
84-
default=[],
85-
help="The books to include; e.g., 'NT', 'OT', 'GEN'. Only used with extract-corpora.",
86-
)
87-
parser.add_argument(
88-
"--exclude",
89-
metavar="books",
90-
nargs="+",
91-
default=[],
92-
help="The books to exclude; e.g., 'NT', 'OT', 'GEN'. Only used with extract-corpora.",
78+
"--config",
79+
help="Path to a configuration file in YAML format. This is used to configure the onboarding process.",
80+
default=None,
81+
type=str,
9382
)
9483
parser.add_argument(
95-
"--markers", default=False, action="store_true", help="Include USFM markers. Only used with extract-corpora."
84+
"--overwrite", help="Overwrite any existing files and folders", default=False, action="store_true"
9685
)
86+
9787
parser.add_argument(
98-
"--lemmas",
88+
"--extract-corpora",
9989
default=False,
10090
action="store_true",
101-
help="Extract lemmas if available. Only used with extract-corpora.",
91+
help="Extract text corpora from the Paratext project.",
10292
)
93+
10394
parser.add_argument(
104-
"--project-vrefs",
95+
"--collect-verse-counts",
10596
default=False,
10697
action="store_true",
107-
help="Extract project verse refs. Only used with extract-corpora.",
98+
help="Collect various counts from the extracted Paratext project.",
10899
)
109100

110101
args = parser.parse_args()
111-
project_name = args.project
102+
if not args.project:
103+
raise ValueError("Project name is required. Please provide a valid Paratext project name using <project>.")
112104

113-
LOGGER.info(f"Onboarding project: {args.project}")
114-
paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name)
105+
project_name = args.project
115106

116107
if args.copy_from:
108+
LOGGER.info(f"Onboarding project: {args.project}")
109+
paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name)
117110
copy_paratext_project_folder(Path(args.copy_from), paratext_project_dir, overwrite=args.overwrite)
118111

119-
if args.extract_corpora:
120-
from .extract_corpora import extract_corpora
112+
if args.config:
113+
config_file = Path(args.config)
114+
if not config_file.exists():
115+
raise FileNotFoundError(f"Config file '{config_file}' does not exist.")
116+
with config_file.open("r", encoding="utf-8") as file:
117+
config = yaml.safe_load(file)
118+
else:
119+
raise ValueError("Config file is required. Please provide a valid configuration file using --config.")
121120

121+
if args.extract_corpora:
122+
LOGGER.info(f"Extracting {project_name}.")
122123
extract_corpora(
123124
projects={project_name},
124-
books_to_include=args.include,
125-
books_to_exclude=args.exclude,
126-
include_markers=args.markers,
127-
extract_lemmas=args.lemmas,
128-
extract_project_vrefs=args.project_vrefs,
125+
books_to_include=config["extract_corpora"]["include"] if "include" in config["extract_corpora"] else [],
126+
books_to_exclude=config["extract_corpora"]["exclude"] if "exclude" in config["extract_corpora"] else [],
127+
include_markers=(config["extract_corpora"]["markers"] if "markers" in config["extract_corpora"] else False),
128+
extract_lemmas=config["extract_corpora"]["lemmas"] if "lemmas" in config["extract_corpora"] else False,
129+
extract_project_vrefs=(
130+
config["extract_corpora"]["project-vrefs"] if "project-vrefs" in config["extract_corpora"] else False
131+
),
132+
)
133+
134+
if args.collect_verse_counts:
135+
if not args.extract_corpora:
136+
LOGGER.warning(
137+
"--extract_corpora was not included. Collecting verse counts requires the corpus to be extracted first."
138+
)
139+
140+
LOGGER.info(f"Collecting verse counts from {project_name}.")
141+
142+
if config["verse_counts"]["output_folder"]:
143+
output_folder = Path(config["verse_counts"]["output_folder"])
144+
if not output_folder.exists():
145+
output_folder.mkdir(parents=True, exist_ok=True)
146+
else:
147+
output_folder = SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name
148+
if not output_folder.exists():
149+
output_folder.mkdir(parents=True, exist_ok=True)
150+
input_folder = (
151+
config["verse_counts"]["input_folder"]
152+
if "input_folder" in config["verse_counts"]
153+
else SIL_NLP_ENV.mt_scripture_dir
154+
)
155+
file_patterns = (
156+
config["verse_counts"]["files"] if "files" in config["verse_counts"] else f"*{project_name}*.txt"
157+
)
158+
159+
input_folder_path = Path(input_folder)
160+
if not input_folder_path.exists():
161+
LOGGER.error(f"Input folder '{input_folder_path}' does not exist. Skipping verse counts collection.")
162+
return
163+
164+
matched_files = list(input_folder_path.glob(file_patterns))
165+
if not matched_files:
166+
LOGGER.error(
167+
f"No files matching pattern '{file_patterns}' found in '{input_folder_path}'. Skipping verse counts collection."
168+
)
169+
return
170+
171+
collect_verse_counts(
172+
input_folder=input_folder_path,
173+
output_folder=output_folder,
174+
file_patterns=file_patterns,
175+
deutero=config["verse_counts"]["deutero"] if "deutero" in config["verse_counts"] else False,
176+
recount=config["verse_counts"]["recount"] if "recount" in config["verse_counts"] else False,
129177
)
130178

131179

0 commit comments

Comments
 (0)