Commit: Fixes

johnml1135 committed Feb 21, 2025
1 parent 685f563 commit fd119ea
Showing 7 changed files with 26 additions and 16 deletions.
2 changes: 1 addition & 1 deletion machine/corpora/zip_paratext_project_terms_parser.py
@@ -19,5 +19,5 @@ def _exists(self, file_name: StrPath) -> bool:
 
     def _open(self, file_name: StrPath) -> Optional[BinaryIO]:
         if file_name in self._archive.namelist():
-            return BytesIO(self._archive.read(file_name))
+            return BytesIO(self._archive.read(str(file_name)))
         return None
2 changes: 1 addition & 1 deletion machine/corpora/zip_paratext_project_text_updater.py
@@ -18,5 +18,5 @@ def _exists(self, file_name: StrPath) -> bool:
 
     def _open(self, file_name: StrPath) -> Optional[BinaryIO]:
         if file_name in self._archive.namelist():
-            return BytesIO(self._archive.read(file_name))
+            return BytesIO(self._archive.read(str(file_name)))
         return None
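Why the str() coercion in both files: zipfile.ZipFile.read is annotated to accept a str member name (or a ZipInfo), while StrPath presumably also admits os.PathLike values, so the newer pyright pinned below flags the unconverted call. A minimal sketch of the same pattern (the archive and member names here are hypothetical, not from this repository):

    from io import BytesIO
    from pathlib import PurePosixPath
    from zipfile import ZipFile

    with ZipFile("project.zip") as archive:
        member = PurePosixPath("Settings.xml")
        # namelist() yields plain strings and read() is typed for str,
        # so coerce the PathLike name once and reuse it for both.
        name = str(member)
        data = BytesIO(archive.read(name)) if name in archive.namelist() else None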
20 changes: 12 additions & 8 deletions machine/translation/huggingface/hugging_face_nmt_engine.py
@@ -80,17 +80,21 @@ def __init__(
         else:
             src_lang_token = src_lang
             tgt_lang_token = tgt_lang
-        if (
-            src_lang is not None
-            and src_lang_token not in self._tokenizer.added_tokens_encoder
-            and src_lang_token not in additional_special_tokens
+        if src_lang is not None and (
+            src_lang_token is None
+            or (
+                src_lang_token not in self._tokenizer.added_tokens_encoder
+                and src_lang_token not in additional_special_tokens  # type: ignore - we already check for None
+            )
         ):
             raise ValueError(f"The specified model does not support the language code '{src_lang}'")
 
-        if (
-            tgt_lang is not None
-            and tgt_lang_token not in self._tokenizer.added_tokens_encoder
-            and tgt_lang_token not in additional_special_tokens
+        if tgt_lang is not None and (
+            tgt_lang_token is None
+            or (
+                tgt_lang_token not in self._tokenizer.added_tokens_encoder
+                and tgt_lang_token not in additional_special_tokens  # type: ignore - we already check for None
+            )
         ):
             raise ValueError(f"The specified model does not support the language code '{tgt_lang}'")

2 changes: 2 additions & 0 deletions machine/translation/huggingface/hugging_face_nmt_model.py
@@ -89,6 +89,8 @@ def __init__(self, model: HuggingFaceNmtModel, corpus: Union[ParallelTextCorpus,
 
     def save(self) -> None:
         super().save()
+        if self._model.training_args.output_dir is None:
+            raise ValueError("Output directory must not be None.")
         output_dir = Path(self._model.training_args.output_dir)
         if output_dir != self._model._model_path:
             shutil.copytree(output_dir, self._model._model_path)
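This guard, and the two like it in the trainer below, narrows training_args.output_dir (apparently typed Optional[str] in the pinned dependency versions) to a concrete str before it reaches Path(). The guard-then-use pattern in isolation (the function name and path are illustrative, not from the repository):

    from pathlib import Path
    from typing import Optional

    def resolve_output_dir(output_dir: Optional[str]) -> Path:
        # Failing fast on None lets every later use assume a real path.
        if output_dir is None:
            raise ValueError("Output directory must not be None.")
        return Path(output_dir)

    print(resolve_output_dir("runs/nmt"))  # hypothetical directory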
6 changes: 5 additions & 1 deletion machine/translation/huggingface/hugging_face_nmt_model_trainer.py
@@ -115,6 +115,8 @@ def train(
         check_canceled: Optional[Callable[[], None]] = None,
     ) -> None:
         last_checkpoint = None
+        if self._training_args.output_dir is None:
+            raise ValueError("Output directory is not set")
         if os.path.isdir(self._training_args.output_dir) and not self._training_args.overwrite_output_dir:
             last_checkpoint = get_last_checkpoint(self._training_args.output_dir)
             if last_checkpoint is None and any(os.path.isfile(p) for p in os.listdir(self._training_args.output_dir)):
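For context: get_last_checkpoint comes from transformers.trainer_utils and returns the newest checkpoint-* subdirectory of an output directory, or None when there is none, which is why the directory must exist (and now, be non-None) before scanning. A minimal sketch with a hypothetical path:

    import os

    from transformers.trainer_utils import get_last_checkpoint

    output_dir = "runs/nmt"  # hypothetical, not from the repository
    # Only scan when the directory exists; otherwise there is nothing to resume.
    last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
    print(last_checkpoint)  # e.g. "runs/nmt/checkpoint-500", or None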
@@ -176,6 +178,8 @@ def find_missing_characters(tokenizer: Any, train_dataset: Dataset, lang_codes:
             return missing_characters
 
         def add_tokens(tokenizer: Any, missing_tokens: List[str]) -> Any:
+            if self._training_args.output_dir is None:
+                raise ValueError("Output directory is not set")
             tokenizer_dir = Path(self._training_args.output_dir)
             tokenizer.save_pretrained(str(tokenizer_dir))
             with open(tokenizer_dir / "tokenizer.json", "r+", encoding="utf-8") as file:
@@ -317,7 +321,7 @@ def preprocess_function(examples):
             model=model,
             args=self._training_args,
             train_dataset=cast(Any, train_dataset),
-            tokenizer=tokenizer,
+            processing_class=tokenizer,
             data_collator=data_collator,
             callbacks=[
                 _ProgressCallback(
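The keyword rename tracks transformers' deprecation of the Trainer `tokenizer` argument in favor of `processing_class` (introduced around transformers 4.46, as far as we know). A hedged compatibility probe, not from the repository, that picks whichever keyword the installed release accepts:

    import inspect

    from transformers import Seq2SeqTrainer

    params = inspect.signature(Seq2SeqTrainer.__init__).parameters
    kwarg = "processing_class" if "processing_class" in params else "tokenizer"
    print(f"Construct Seq2SeqTrainer with {kwarg}=<tokenizer>")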
8 changes: 4 additions & 4 deletions poetry.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -83,7 +83,7 @@ pytest-cov = "^4.1.0"
 ipykernel = "^6.7.0"
 jupyter = "^1.0.0"
 pandas = "^2.0.3"
-pyright = { extras = ["nodejs"], version = "^1.1.362" }
+pyright = { extras = ["nodejs"], version = "^1.1.394" }
 decoy = "^2.1.0"
 pep8-naming = "^0.14.1"
 