|
2 | 2 | import logging |
3 | 3 | from pathlib import Path |
4 | 4 |
|
| 5 | +import yaml |
| 6 | + |
| 7 | +from .collect_verse_counts import collect_verse_counts |
5 | 8 | from .environment import SIL_NLP_ENV |
| 9 | +from .extract_corpora import extract_corpora |
6 | 10 |
|
7 | 11 | LOGGER = logging.getLogger(__package__ + ".onboard_project") |
8 | 12 |
|
@@ -62,70 +66,114 @@ def main() -> None: |
62 | 66 | parser.add_argument( |
63 | 67 | "project", |
64 | 68 | help="Paratext project name. The project will be stored on the bucket at Paratext/projects/<project>.", |
| 69 | + type=str, |
65 | 70 | ) |
66 | 71 | parser.add_argument( |
67 | 72 | "--copy-from", |
68 | 73 | help="Path to a downloaded Paratext project folder. The local project will be copied to the bucket.", |
69 | 74 | default=None, |
| 75 | + type=str, |
70 | 76 | ) |
71 | 77 | parser.add_argument( |
72 | | - "--overwrite", help="Overwrite any existing files and folders", default=False, action="store_true" |
73 | | - ) |
74 | | - parser.add_argument( |
75 | | - "--extract-corpora", |
76 | | - help="Extract text corpora.", |
77 | | - default=False, |
78 | | - action="store_true", |
79 | | - ) |
80 | | - parser.add_argument( |
81 | | - "--include", |
82 | | - metavar="books", |
83 | | - nargs="+", |
84 | | - default=[], |
85 | | - help="The books to include; e.g., 'NT', 'OT', 'GEN'. Only used with extract-corpora.", |
86 | | - ) |
87 | | - parser.add_argument( |
88 | | - "--exclude", |
89 | | - metavar="books", |
90 | | - nargs="+", |
91 | | - default=[], |
92 | | - help="The books to exclude; e.g., 'NT', 'OT', 'GEN'. Only used with extract-corpora.", |
| 78 | + "--config", |
| 79 | + help="Path to a configuration file in YAML format. This is used to configure the onboarding process.", |
| 80 | + default=None, |
| 81 | + type=str, |
93 | 82 | ) |
94 | 83 | parser.add_argument( |
95 | | - "--markers", default=False, action="store_true", help="Include USFM markers. Only used with extract-corpora." |
| 84 | + "--overwrite", help="Overwrite any existing files and folders", default=False, action="store_true" |
96 | 85 | ) |
| 86 | + |
97 | 87 | parser.add_argument( |
98 | | - "--lemmas", |
| 88 | + "--extract-corpora", |
99 | 89 | default=False, |
100 | 90 | action="store_true", |
101 | | - help="Extract lemmas if available. Only used with extract-corpora.", |
| 91 | + help="Extract text corpora from the Paratext project.", |
102 | 92 | ) |
| 93 | + |
103 | 94 | parser.add_argument( |
104 | | - "--project-vrefs", |
| 95 | + "--collect-verse-counts", |
105 | 96 | default=False, |
106 | 97 | action="store_true", |
107 | | - help="Extract project verse refs. Only used with extract-corpora.", |
| 98 | + help="Collect various counts from the extracted Paratext project.", |
108 | 99 | ) |
109 | 100 |
|
110 | 101 | args = parser.parse_args() |
111 | | - project_name = args.project |
| 102 | + if not args.project: |
| 103 | + raise ValueError("Project name is required. Please provide a valid Paratext project name using <project>.") |
112 | 104 |
|
113 | | - LOGGER.info(f"Onboarding project: {args.project}") |
114 | | - paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name) |
| 105 | + project_name = args.project |
115 | 106 |
|
116 | 107 | if args.copy_from: |
| 108 | + LOGGER.info(f"Onboarding project: {args.project}") |
| 109 | + paratext_project_dir: Path = create_paratext_project_folder_if_not_exists(project_name) |
117 | 110 | copy_paratext_project_folder(Path(args.copy_from), paratext_project_dir, overwrite=args.overwrite) |
118 | 111 |
|
119 | | - if args.extract_corpora: |
120 | | - from .extract_corpora import extract_corpora |
| 112 | + if args.config: |
| 113 | + config_file = Path(args.config) |
| 114 | + if not config_file.exists(): |
| 115 | + raise FileNotFoundError(f"Config file '{config_file}' does not exist.") |
| 116 | + with config_file.open("r", encoding="utf-8") as file: |
| 117 | + config = yaml.safe_load(file) |
| 118 | + else: |
| 119 | + raise ValueError("Config file is required. Please provide a valid configuration file using --config.") |
121 | 120 |
|
| 121 | + if args.extract_corpora: |
| 122 | + LOGGER.info(f"Extracting {project_name}.") |
122 | 123 | extract_corpora( |
123 | 124 | projects={project_name}, |
124 | | - books_to_include=args.include, |
125 | | - books_to_exclude=args.exclude, |
126 | | - include_markers=args.markers, |
127 | | - extract_lemmas=args.lemmas, |
128 | | - extract_project_vrefs=args.project_vrefs, |
| 125 | + books_to_include=config["extract_corpora"]["include"] if "include" in config["extract_corpora"] else [], |
| 126 | + books_to_exclude=config["extract_corpora"]["exclude"] if "exclude" in config["extract_corpora"] else [], |
| 127 | + include_markers=(config["extract_corpora"]["markers"] if "markers" in config["extract_corpora"] else False), |
| 128 | + extract_lemmas=config["extract_corpora"]["lemmas"] if "lemmas" in config["extract_corpora"] else False, |
| 129 | + extract_project_vrefs=( |
| 130 | + config["extract_corpora"]["project-vrefs"] if "project-vrefs" in config["extract_corpora"] else False |
| 131 | + ), |
| 132 | + ) |
| 133 | + |
| 134 | + if args.collect_verse_counts: |
| 135 | + if not args.extract_corpora: |
| 136 | + LOGGER.warning( |
| 137 | + "--extract_corpora was not included. Collecting verse counts requires the corpus to be extracted first." |
| 138 | + ) |
| 139 | + |
| 140 | + LOGGER.info(f"Collecting verse counts from {project_name}.") |
| 141 | + |
| 142 | + if config["verse_counts"]["output_folder"]: |
| 143 | + output_folder = Path(config["verse_counts"]["output_folder"]) |
| 144 | + if not output_folder.exists(): |
| 145 | + output_folder.mkdir(parents=True, exist_ok=True) |
| 146 | + else: |
| 147 | + output_folder = SIL_NLP_ENV.mt_experiments_dir / "verse_counts" / project_name |
| 148 | + if not output_folder.exists(): |
| 149 | + output_folder.mkdir(parents=True, exist_ok=True) |
| 150 | + input_folder = ( |
| 151 | + config["verse_counts"]["input_folder"] |
| 152 | + if "input_folder" in config["verse_counts"] |
| 153 | + else SIL_NLP_ENV.mt_scripture_dir |
| 154 | + ) |
| 155 | + file_patterns = ( |
| 156 | + config["verse_counts"]["files"] if "files" in config["verse_counts"] else f"*{project_name}*.txt" |
| 157 | + ) |
| 158 | + |
| 159 | + input_folder_path = Path(input_folder) |
| 160 | + if not input_folder_path.exists(): |
| 161 | + LOGGER.error(f"Input folder '{input_folder_path}' does not exist. Skipping verse counts collection.") |
| 162 | + return |
| 163 | + |
| 164 | + matched_files = list(input_folder_path.glob(file_patterns)) |
| 165 | + if not matched_files: |
| 166 | + LOGGER.error( |
| 167 | + f"No files matching pattern '{file_patterns}' found in '{input_folder_path}'. Skipping verse counts collection." |
| 168 | + ) |
| 169 | + return |
| 170 | + |
| 171 | + collect_verse_counts( |
| 172 | + input_folder=input_folder_path, |
| 173 | + output_folder=output_folder, |
| 174 | + file_patterns=file_patterns, |
| 175 | + deutero=config["verse_counts"]["deutero"] if "deutero" in config["verse_counts"] else False, |
| 176 | + recount=config["verse_counts"]["recount"] if "recount" in config["verse_counts"] else False, |
129 | 177 | ) |
130 | 178 |
|
131 | 179 |
|
|
0 commit comments