Commit: Fixes

johnml1135 committed Feb 21, 2025
1 parent 685f563 commit fd119ea
Showing 7 changed files with 26 additions and 16 deletions.
2 changes: 1 addition & 1 deletion machine/corpora/zip_paratext_project_terms_parser.py
@@ -19,5 +19,5 @@ def _exists(self, file_name: StrPath) -> bool:
 
     def _open(self, file_name: StrPath) -> Optional[BinaryIO]:
         if file_name in self._archive.namelist():
-            return BytesIO(self._archive.read(file_name))
+            return BytesIO(self._archive.read(str(file_name)))
         return None
2 changes: 1 addition & 1 deletion machine/corpora/zip_paratext_project_text_updater.py
@@ -18,5 +18,5 @@ def _exists(self, file_name: StrPath) -> bool:
 
     def _open(self, file_name: StrPath) -> Optional[BinaryIO]:
         if file_name in self._archive.namelist():
-            return BytesIO(self._archive.read(file_name))
+            return BytesIO(self._archive.read(str(file_name)))
         return None
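Why the str() coercion in both files: zipfile.ZipFile.read is annotated to accept a str member name (or a ZipInfo), while StrPath presumably also admits os.PathLike values, so the newer pyright pinned below flags the unconverted call. A minimal sketch of the same pattern (the archive and member names here are hypothetical, not from this repository):

    from io import BytesIO
    from pathlib import PurePosixPath
    from zipfile import ZipFile

    with ZipFile("project.zip") as archive:
        member = PurePosixPath("Settings.xml")
        # namelist() yields plain strings and read() is typed for str,
        # so coerce the PathLike name once and reuse it for both.
        name = str(member)
        data = BytesIO(archive.read(name)) if name in archive.namelist() else None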
20 changes: 12 additions & 8 deletions machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -80,17 +80,21 @@ def __init__(
         else:
             src_lang_token = src_lang
             tgt_lang_token = tgt_lang
-        if (
-            src_lang is not None
-            and src_lang_token not in self._tokenizer.added_tokens_encoder
-            and src_lang_token not in additional_special_tokens
+        if src_lang is not None and (
+            src_lang_token is None
+            or (
+                src_lang_token not in self._tokenizer.added_tokens_encoder
+                and src_lang_token not in additional_special_tokens  # type: ignore - we already check for None
+            )
         ):
             raise ValueError(f"The specified model does not support the language code '{src_lang}'")
 
-        if (
-            tgt_lang is not None
-            and tgt_lang_token not in self._tokenizer.added_tokens_encoder
-            and tgt_lang_token not in additional_special_tokens
+        if tgt_lang is not None and (
+            tgt_lang_token is None
+            or (
+                tgt_lang_token not in self._tokenizer.added_tokens_encoder
+                and tgt_lang_token not in additional_special_tokens  # type: ignore - we already check for None
+            )
         ):
             raise ValueError(f"The specified model does not support the language code '{tgt_lang}'")

2 changes: 2 additions & 0 deletions machine/translation/huggingface/hugging_face_nmt_model.py
@@ -89,6 +89,8 @@ def __init__(self, model: HuggingFaceNmtModel, corpus: Union[ParallelTextCorpus,
 
     def save(self) -> None:
         super().save()
+        if self._model.training_args.output_dir is None:
+            raise ValueError("Output directory must not be None.")
         output_dir = Path(self._model.training_args.output_dir)
         if output_dir != self._model._model_path:
             shutil.copytree(output_dir, self._model._model_path)
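This guard, and the two like it in the trainer below, narrows training_args.output_dir (apparently typed Optional[str] in the pinned dependency versions) to a concrete str before it reaches Path(). The guard-then-use pattern in isolation (the function name and path are illustrative, not from the repository):

    from pathlib import Path
    from typing import Optional

    def resolve_output_dir(output_dir: Optional[str]) -> Path:
        # Failing fast on None lets every later use assume a real path.
        if output_dir is None:
            raise ValueError("Output directory must not be None.")
        return Path(output_dir)

    print(resolve_output_dir("runs/nmt"))  # hypothetical directory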
6 changes: 5 additions & 1 deletion machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -115,6 +115,8 @@ def train(
         check_canceled: Optional[Callable[[], None]] = None,
     ) -> None:
         last_checkpoint = None
+        if self._training_args.output_dir is None:
+            raise ValueError("Output directory is not set")
         if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
             last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
             if last_checkpoint is None and any(os.path.isfile(p) for p in os.listdir(self._training_args.output_dir)):
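For context: get_last_checkpoint comes from transformers.trainer_utils and returns the newest checkpoint-* subdirectory of an output directory, or None when there is none, which is why the directory must exist (and now, be non-None) before scanning. A minimal sketch with a hypothetical path:

    import os

    from transformers.trainer_utils import get_last_checkpoint

    output_dir = "runs/nmt"  # hypothetical, not from the repository
    # Only scan when the directory exists; otherwise there is nothing to resume.
    last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    print(last_checkpoint)  # e.g. "runs/nmt/checkpoint-500", or None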
@@ -176,6 +178,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
             return missing_characters
 
         def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
+            if self._training_args.output_dir is None:
+                raise ValueError("Output directory is not set")
             tokenizer_dir = Path(self._training_args.output_dir)
             tokenizer.save_pretrained(str(tokenizer_dir))
             with open(tokenizer_dir / "tokenizer.json", "r+", encoding="utf-8") as file:
@@ -317,7 +321,7 @@ def preprocess_function(examples):
             model=model,
             args=self._training_args,
             train_dataset=cast(Any, train_dataset),
-            tokenizer=tokenizer,
+            processing_class=tokenizer,
             data_collator=data_collator,
             callbacks=[
                 _ProgressCallback(
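The keyword rename tracks transformers' deprecation of the Trainer `tokenizer` argument in favor of `processing_class` (introduced around transformers 4.46, as far as we know). A hedged compatibility probe, not from the repository, that picks whichever keyword the installed release accepts:

    import inspect

    from transformers import Seq2SeqTrainer

    params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    kwarg = "processing_class" if "processing_class" in params else "tokenizer"
    print(f"Construct Seq2SeqTrainer with {kwarg}=<tokenizer>")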
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -83,7 +83,7 @@ pytest-cov = "^4.1.0"
 ipykernel = "^6.7.0"
 jupyter = "^1.0.0"
 pandas = "^2.0.3"
-pyright = { extras = ["nodejs"], version = "^1.1.362" }
+pyright = { extras = ["nodejs"], version = "^1.1.394" }
 decoy = "^2.1.0"
 pep8-naming = "^0.14.1"
 