From db24a440e1a927b03baefa730d56a1cbab22b9b8 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 15:09:46 +0100 Subject: [PATCH 1/5] Add helper function to train fasttext model --- src/langsfer/utils.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/langsfer/utils.py b/src/langsfer/utils.py index 8528f19..4fd740d 100644 --- a/src/langsfer/utils.py +++ b/src/langsfer/utils.py @@ -2,11 +2,13 @@ import os import shutil from pathlib import Path +from typing import Iterable import requests +from gensim.models import FastText from tqdm.auto import tqdm -__all__ = ["download_file"] +__all__ = ["download_file", "train_fasttext_model"] def download_file( @@ -36,3 +38,32 @@ def download_file( shutil.copyfileobj(r_raw, f) return destination_path + + +def train_fasttext_model( + corpus_iterable: Iterable[list[str]], + *, + total_examples: int, + vector_size: int = 300, + window: int = 3, + min_count: int = 10, + epochs: int = 3, +) -> FastText: + """Trains a FastText model. + + Args: + corpus_iterable: Iterator of lists of tokens (tokenized sentences) used as training data. + total_examples : Count of sentences. + vector_size : Dimensionality of the word vectors. + window : The maximum distance between the current and predicted word within a sentence. + min_count : The model ignores all words with total frequency lower than this. + epochs : Number of iterations (epochs) over the corpus. + """ + model = FastText(vector_size=vector_size, window=window, min_count=min_count) + model.build_vocab(corpus_iterable=corpus_iterable) + model.train( + corpus_iterable=corpus_iterable, + total_examples=total_examples, + epochs=epochs, + ) + return model From 8b0a156bb22ae0804d8662840802a45b5316d124 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 15:10:20 +0100 Subject: [PATCH 2/5] Fix docstring --- src/langsfer/embeddings.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/langsfer/embeddings.py b/src/langsfer/embeddings.py index 73f4820..43b67ed 100644 --- a/src/langsfer/embeddings.py +++ b/src/langsfer/embeddings.py @@ -49,10 +49,9 @@ class FastTextEmbeddings(AuxiliaryEmbeddings): """Loads embeddings from a pretrained FastText model from a local path or a url. Args: - model_name_or_path: Name or path of model to load. + model: FastText model. Attributes: - model_name_or_path: Name or path of model. model: FastText model. 
""" From f304d337eb859c8d85778a222e300f6ba5a9a383 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 15:10:32 +0100 Subject: [PATCH 3/5] Add tutorial notebook for FOCUS --- notebooks/FOCUS_tutorial.ipynb | 890 +++++++++++++++++++++++++++++++++ 1 file changed, 890 insertions(+) create mode 100644 notebooks/FOCUS_tutorial.ipynb diff --git a/notebooks/FOCUS_tutorial.ipynb b/notebooks/FOCUS_tutorial.ipynb new file mode 100644 index 0000000..2fcec42 --- /dev/null +++ b/notebooks/FOCUS_tutorial.ipynb @@ -0,0 +1,890 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FOCUS Tutorial\n", + "\n", + "In this tutorial, we will use Langsfer to transfer a model trained in English to German with the [FOCUS](https://arxiv.org/abs/2305.14481) method, similarily to one of the experiments described in the paper.\n", + "\n", + "FOCUS is a cross-lingual language transfer method that is similar to WECHSEL in that it uses FastText auxiliary embeddings\n", + "but rather than use pre-trained ones, it relies on embeddings that were trained from scratch on pre-tokenized text using the source and target tokenizers.\n", + "\n", + "It is also similar to CLP-Transfer in that it relies on finding overlapping and non-overlapping tokens,\n", + "but rather rely on exact matching it uses fuzzy token matching to determine overlapping tokens.\n", + "\n", + "The method requires as input:\n", + "\n", + "- a tokenizer in the source language,\n", + "- a pre-trained language model in the source language,\n", + "- a tokenizer in the target language,\n", + "- 2 monolingual fastText embeddings for source and target languages respectively.\n", + " They are trained from scratch for both languages using pre-tokenized text with the respective language tokenizer.\n", + "\n", + "For the tutorial, we will use as much as possible the same parameters as described in the paper:\n", + "\n", + "- For the source model and tokenizer, we will use [gpt2-large](openai-community/gpt2-large),\n", + "- For the target tokenizer, we will train one from scratch,\n", + "\n", + "For the sake of brevity, we will however use fewer training samples and steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "We begin by importing libraries and setting some defaults." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "import random\n", + "import warnings\n", + "from typing import Generator\n", + "\n", + "import datasets\n", + "import numpy as np\n", + "import torch\n", + "from transformers import (\n", + " AutoModel,\n", + " AutoTokenizer,\n", + " AutoModelForCausalLM,\n", + " DataCollatorForLanguageModeling,\n", + " TrainingArguments,\n", + " Trainer,\n", + ")\n", + "\n", + "warnings.simplefilter(\"ignore\")\n", + "\n", + "# Constants\n", + "SOURCE_MODEL_NAME = \"openai-community/gpt2-large\"\n", + "DATASET_NAME = \"oscar-corpus/oscar\"\n", + "SOURCE_DATASET_CONFIG_NAME = \"unshuffled_deduplicated_en\"\n", + "TARGET_DATASET_CONFIG_NAME = \"unshuffled_deduplicated_de\"\n", + "DATASET_SIZE = 20000\n", + "TRAIN_DATASET_SIZE = 16000\n", + "TRAIN_BATCH_SIZE = 2\n", + "GRADIENT_ACCUMULATION_STEPS = 64\n", + "EVAL_STEPS = 4000\n", + "MAX_TRAIN_STEPS = 24000\n", + "LEARNING_RATE = 1e-4\n", + "WEIGHT_DECAY = 0.01\n", + "ADAM_EPSILON = 1e-6\n", + "ADAM_BETA1 = 0.9\n", + "ADAM_BETA2 = 0.98\n", + "SEED = 16\n", + "\n", + "random.seed(SEED)\n", + "np.random.seed(SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the following functions and classes from Langsfer." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload\n", + "from langsfer.high_level import focus\n", + "from langsfer.embeddings import FastTextEmbeddings\n", + "from langsfer.utils import train_fasttext_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset\n", + "\n", + "We use the [datasets](https://huggingface.co/docs/datasets/index) library to load the [oscar](https://huggingface.co/datasets/oscar-corpus/oscar), which stands for **O**pen **S**uper-large **C**rawled **A**LMAnaCH co**R**pus, dataset's german configuration and then take a limited number of samples from it for training and validation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = datasets.load_dataset(\n", + " DATASET_NAME,\n", + " TARGET_DATASET_CONFIG_NAME,\n", + " split=\"train\",\n", + " streaming=True,\n", + " trust_remote_code=True,\n", + ")\n", + "dataset = dataset.shuffle(seed=SEED)\n", + "dataset = dataset.take(DATASET_SIZE)\n", + "train_dataset = dataset.take(TRAIN_DATASET_SIZE)\n", + "val_dataset = dataset.skip(TRAIN_DATASET_SIZE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We take sample text from the validation set in order to compare tokenization between source and target tokenizers as well as for evaluating the generation of our trained model at the end. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. 
Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n" + ] + } + ], + "source": [ + "sample_text = list(val_dataset.skip(10).take(1))[0][\"text\"]\n", + "print(sample_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings and Tokenizers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load the source tokenizer as well as the source model and extract the input embeddings matrix from it." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "source_tokenizer = AutoTokenizer.from_pretrained(SOURCE_MODEL_NAME)\n", + "source_model = AutoModel.from_pretrained(SOURCE_MODEL_NAME)\n", + "source_embeddings_matrix = source_model.get_input_embeddings().weight.detach().numpy()\n", + "del source_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use the source tokenizer to convert the sample text to tokens." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens 172, tokens: ['mit', 'ĠEva', 'ĠMatt', 'es', 'Ġal', 's', 'ĠKl', 'ara', 'ĠBl', 'um', '.', 'ĠE', 'ine', 'ĠB', 'ä', 'cker', 'st', 'och', 'ter', 'Ġstir', 'bt', 'Ġin', 'Ġder', 'ĠBack', 'r', 'ö', 'h', 're', '.', 'ĠJet', 'z', 't', 'Ġhat', 'Ġi', 'h', 're', 'ĠSchw', 'ester', 'Ġ(', 'Jul', 'ia', 'ĠJ', 'ents', 'ch', ')', 'ĠAng', 'st', 'âĢ¦', 'ĠDo', 'ppel', 'b', 'ö', 'dig', '.', 'Ċ', 'in', 'Ġder', 'Ġre', 'ch', 'ten', 'ĠArm', 'be', 'uge', 'Ġbe', 'im', 'ĠÃĸ', 'ff', 'nen', 'Ġdes', 'ĠMeh', 'ls', 'il', 'os', 'Ġz', 'ur', 'ĠR', 'ett', 'ung', 'Ġder', 'ĠB', 'ä', 'cker', 'st', 'oc', 'her', ',', 'Ġwel', 'ch', 'Ġe', 'in', 'ĠReg', 'ief', 'eh', 'ler', '!', 'ĠDer', 'Ġt', 'ief', 'er', 'ge', 'hend', 'e', 'ĠSinn', 'Ġdes', 'ĠFall', 'es', 'Ġw', 'ird', 'Ġan', 'son', 'sten', 'Ġa', 'uch', 'Ġn', 'icht', 'Ġk', 'lar', '.', 'ĠW', 'irk', 't', 'Ġle', 'ider', 'Ġall', 'es', 'Ġet', 'was', 'Ġz', 'us', 'amm', 'enges', 'ch', 'ust', 'ert', '.', 'Ċ', 'W', 'er', 'Ġsp', 'iel', 'te', 'Ġdie', 'ĠHau', 'pt', 'rol', 'le', 'Ġin', 'ĠFilm', 'Ġ\"', 'The', 'ĠInternational', '\"', 'Ġund', 'Ġw', 'urd', 'e', 'Ġal', 's', 'Ġpot', 'enzie', 'ller', 'ĠJames', 'ĠBond', '-', 'N', 'ach', 'fol', 'ger', 'Ġge', 'hand', 'elt', '?']\n" + ] + } + ], + "source": [ + "tokens = source_tokenizer.tokenize(sample_text)\n", + "print(f\"Number of tokens {len(tokens)}, tokens: {tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We train a new target tokenizer using the same configuration as the source tokenizer using the training dataset " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "def batch_iterator(\n", + " dataset: datasets.Dataset, batch_size: int = 1000\n", + ") -> Generator[str, None, None]:\n", + " for batch in dataset.iter(batch_size=batch_size):\n", + " yield batch[\"text\"]\n", + "\n", + "\n", + "target_tokenizer = source_tokenizer.train_new_from_iterator(\n", + " batch_iterator(train_dataset), vocab_size=len(source_tokenizer)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then use the target tokenizer to convert the 
sample text to tokens and notice that the conversion creates fewer tokens than previously." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens 106, tokens: ['mit', 'ĠEva', 'ĠMatt', 'es', 'Ġals', 'ĠKl', 'ara', 'ĠBlum', '.', 'ĠEine', 'ĠBäcker', 'stochter', 'Ġstirbt', 'Ġin', 'Ġder', 'ĠBack', 'röhre', '.', 'ĠJetzt', 'Ġhat', 'Ġihre', 'ĠSchwester', 'Ġ(', 'Julia', 'ĠJ', 'ent', 'sch', ')', 'ĠAngst', 'âĢ¦', 'ĠDoppel', 'bö', 'dig', '.', 'Ċ', 'in', 'Ġder', 'Ġrechten', 'ĠArmb', 'euge', 'Ġbeim', 'ĠÃĸffnen', 'Ġdes', 'ĠMehl', 'sil', 'os', 'Ġzur', 'ĠRettung', 'Ġder', 'ĠBäcker', 'st', 'ocher', ',', 'Ġwelch', 'Ġein', 'ĠReg', 'ief', 'ehler', '!', 'ĠDer', 'Ġtiefer', 'gehende', 'ĠSinn', 'Ġdes', 'ĠFall', 'es', 'Ġwird', 'Ġansonsten', 'Ġauch', 'Ġnicht', 'Ġklar', '.', 'ĠWir', 'kt', 'Ġleider', 'Ġalles', 'Ġetwas', 'Ġzusammen', 'gesch', 'uster', 't', '.', 'Ċ', 'Wer', 'Ġspielte', 'Ġdie', 'ĠHauptrolle', 'Ġin', 'ĠFilm', 'Ġ\"', 'The', 'ĠInternational', '\"', 'Ġund', 'Ġwurde', 'Ġals', 'Ġpoten', 'ziel', 'ler', 'ĠJames', 'ĠBond', '-', 'Nach', 'folger', 'Ġgehandelt', '?']\n" + ] + } + ], + "source": [ + "tokens = target_tokenizer.tokenize(sample_text)\n", + "print(f\"Number of tokens {len(tokens)}, tokens: {tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FastText Embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "source_train_dataset = (\n", + " datasets.load_dataset(\n", + " DATASET_NAME,\n", + " SOURCE_DATASET_CONFIG_NAME,\n", + " split=\"train\",\n", + " streaming=True,\n", + " trust_remote_code=True,\n", + " )\n", + " .shuffle(seed=SEED)\n", + " .take(DATASET_SIZE)\n", + ")\n", + "source_train_tokenized_dataset = source_train_dataset.map(\n", + " lambda x: source_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "target_train_tokenized_dataset = train_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then train auxiliary fasttext embeddings based on tokenized text using the source and target tokenizers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "source_fasttext_model = train_fasttext_model(\n", + " source_train_dataset, total_examples=TRAIN_DATASET_SIZE\n", + ")\n", + "source_auxiliary_embeddings = FastTextEmbeddings(source_fasttext_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "target_fasttext_model = train_fasttext_model(\n", + " train_dataset, total_examples=TRAIN_DATASET_SIZE\n", + ")\n", + "target_auxiliary_embeddings = FastTextEmbeddings(target_fasttext_model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Embedding Initialization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we instantiate the embedding initializer for FOCUS" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_initializer = focus(\n", + " source_tokenizer=source_tokenizer,\n", + " source_embeddings_matrix=source_embeddings_matrix,\n", + " target_tokenizer=target_tokenizer,\n", + " target_auxiliary_embeddings=target_auxiliary_embeddings,\n", + " source_auxiliary_embeddings=source_auxiliary_embeddings,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then initialize the target embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b4d885dd7d254257a548f91c253ce318", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Non-Overlapping Tokens: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "target_embeddings_matrix = embedding_initializer.initialize(seed=16, show_progress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have the initialized embeddings matrix, we can use it to replace the embeddings matrix in the source model." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "target_model_focus = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)\n", + "\n", + "# Resize its embedding layer to match the target tokenizer's vocabulary size\n", + "target_model_focus.resize_token_embeddings(len(target_tokenizer))\n", + "\n", + "# Replace the source embeddings matrix with the target embeddings matrix\n", + "target_model_focus.get_input_embeddings().weight.data = torch.as_tensor(\n", + " target_embeddings_matrix\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> We used `AutoModelForCausalLM` instead of `AutoModel` because we will train the newly initialized model for causal language modelling." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset preprocessing\n", + "\n", + "Before training, we must preprocess the training and validation sets by tokenizing the text, removing all other columns and then converting the resulting arrays to PyTorch tensors."
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = train_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")\n", + "train_dataset = train_dataset.with_format(\"torch\")\n", + "\n", + "val_dataset = val_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")\n", + "val_dataset = val_dataset.with_format(\"torch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define the training parameters and instantiate a [Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer) object." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "max_steps is given, it will override any value given in num_train_epochs\n" + ] + } + ], + "source": [ + "data_collator = DataCollatorForLanguageModeling(tokenizer=target_tokenizer, mlm=False)\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=\"/tmp/focus\",\n", + " eval_strategy=\"steps\",\n", + " report_to=\"tensorboard\",\n", + " eval_steps=EVAL_STEPS // GRADIENT_ACCUMULATION_STEPS,\n", + " max_steps=MAX_TRAIN_STEPS // GRADIENT_ACCUMULATION_STEPS,\n", + " per_device_train_batch_size=TRAIN_BATCH_SIZE,\n", + " gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n", + " learning_rate=LEARNING_RATE,\n", + " weight_decay=WEIGHT_DECAY,\n", + " adam_epsilon=ADAM_EPSILON,\n", + " adam_beta1=ADAM_BETA1,\n", + " adam_beta2=ADAM_BETA2,\n", + " bf16=True,\n", + ")\n", + "\n", + "if target_tokenizer.pad_token is None:\n", + " target_tokenizer.pad_token = target_tokenizer.eos_token\n", + "\n", + "trainer = Trainer(\n", + " model=target_model_focus,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " data_collator=data_collator,\n", + " tokenizer=target_tokenizer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We evaluate the model before training by computing the average loss on the entire validation set" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation loss before training: 10.399\n" + ] + } + ], + "source": [ + "eval_loss = trainer.evaluate()[\"eval_loss\"]\n", + "print(f\"Evaluation loss before training: {eval_loss:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and then we take the sample text, truncate it, and have the newly initialized model generate a completion for it." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n", + "The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior.
Please pass your input's `attention_mask` to obtain reliable results.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n", + "---\n", + "Shortened Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester\n", + "---\n", + "Generated Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester, jemanden sich nicht zu einer ganzen Weltkrieg und die Verwendung von den Geburtstag auf dem Königin erfolgt werden kann:\n", + "The German-born American who was the first person to be on Mars and is now an international hero of human rights has been made into one by his own people for using \"the most extreme form\" possible when he used it as part of their war with Russia over Ukraine's Donbas region last week (see below). The story that we are about to read will make you want more than ever what can only have once been our national ideal – not just because this man did so much good but also how far Germany would go if they were free from all these other bad men around them! And then there is another side too…<|endoftext|>A new book out Monday details some very personal information — like where your ex or bestie might live at any moment— provided through Facebook Messenger while dating apps such as Grindr and Scrotumz let us know which girls may still see eachother online even after she/he no longs for her backside. But according to New York Magazine , social media data collection via mobile app BBM means its userbase is bigger than Twitter . That number? More than 1bn per day; 2\n" + ] + } + ], + "source": [ + "sample_input_ids = target_tokenizer(sample_text)[\"input_ids\"]\n", + "shortened_input_ids = sample_input_ids[: len(sample_input_ids) // 3 - 13]\n", + "shortened_text = target_tokenizer.decode(shortened_input_ids, add_special_tokens=False)\n", + "\n", + "generated_token_ids = (\n", + " trainer.model.generate(\n", + " torch.as_tensor(shortened_input_ids).reshape(1, -1).to(trainer.model.device),\n", + " max_length=300,\n", + " min_length=10,\n", + " top_p=0.9,\n", + " temperature=0.9,\n", + " repetition_penalty=2.0,\n", + " )\n", + " .detach()\n", + " .cpu()\n", + " .numpy()\n", + " .reshape(-1)\n", + ")\n", + "generated_tokens = target_tokenizer.decode(\n", + " generated_token_ids, add_special_tokens=False\n", + ")\n", + "print(\"Original Text:\")\n", + "print(sample_text)\n", + "print(\"---\")\n", + "print(\"Shortened Text:\")\n", + "print(shortened_text)\n", + "print(\"---\")\n", + "print(\"Generated Text:\")\n", + "print(generated_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [375/375 4:48:06, Epoch 5/9223372036854775807]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation Loss
62No log6.410900
124No log6.019698
186No log5.782487
248No log5.624911
310No log5.540668
372No log5.507936

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=375, training_loss=6.1109830729166665, metrics={'train_runtime': 17329.4418, 'train_samples_per_second': 5.54, 'train_steps_per_second': 0.022, 'total_flos': 2.917360387347456e+17, 'train_loss': 6.1109830729166665, 'epoch': 5.166666666666667})" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finally repeat the model evaluationg after the training" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation loss after training: 5.508\n" + ] + } + ], + "source": [ + "eval_loss = trainer.evaluate()[\"eval_loss\"]\n", + "print(f\"Evaluation loss after training: {eval_loss:.3f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n", + "---\n", + "Shortened Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester\n", + "---\n", + "Generated Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester eine kleine Rolle, die sich mit dem Mann und den anderen Menschen zu einem kleinen Tag auf einen Blick machen können!\n", + "Die beiden Schüler ist ein paar Minuten lang wieder im Jahr von einer Hand aus ihren Augen an seinen ersten Mal gestickert werden - das war es nicht so gut wie man kann... Die meisten schliessen sie auch mal nach Hause.... Ich habe mir schon sehr viel Zeit für mich (und ich bin ja noch nur) aber da muss ihr jetzt etwas ganz einfach sein!! Und dann gibt's meine Frau bei uns!!! :) ))))))))))( !!!!!)??????????!?!??)<3 <-->=(([^_]|^^;:;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;:: ::::::::::::: ::== =-__+----->++---------. |/\\ / \\ _` `' '''') . || [ ] + >+++ +++ ++ * --- ** *** ----** ----- ------* --.--.<--><.. .. ... ......... #..............#.......######@######################## @######################## ,,,.,,.~.-.:&%$ % $ & ~{} { } \"\" ).).),.));()):.;-)>
*******~~***********~~***********~~***********~~***********~~***********~~***********~~***********~~***********~~***********~~********\n" + ] + } + ], + "source": [ + "sample_input_ids = target_tokenizer(sample_text)[\"input_ids\"]\n", + "shortened_input_ids = sample_input_ids[: len(sample_input_ids) // 3 - 13]\n", + "shortened_text = target_tokenizer.decode(shortened_input_ids, add_special_tokens=False)\n", + "\n", + "generated_token_ids = (\n", + " trainer.model.generate(\n", + " torch.as_tensor(shortened_input_ids).reshape(1, -1).to(trainer.model.device),\n", + " max_length=300,\n", + " min_length=10,\n", + " top_p=0.9,\n", + " temperature=0.9,\n", + " repetition_penalty=2.0,\n", + " )\n", + " .detach()\n", + " .cpu()\n", + " .numpy()\n", + " .reshape(-1)\n", + ")\n", + "generated_tokens = target_tokenizer.decode(\n", + " generated_token_ids, add_special_tokens=False\n", + ")\n", + "print(\"Original Text:\")\n", + "print(sample_text)\n", + "print(\"---\")\n", + "print(\"Shortened Text:\")\n", + "print(shortened_text)\n", + "print(\"---\")\n", + "print(\"Generated Text:\")\n", + "print(generated_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generated text's quality is not bad but the model needs further training on more data.\n", + "This was just done for the sake of the tutorial and is not meant to be a full-blown model training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary\n", + "\n", + "In this tutorial, we have seen how to use FOCUS in order to transfer a pre-trained language model to a new language." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langsfer-I0oHYpHZ-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From a3567d573b591d0c15e7d02a6be00ffe30ceee10 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 15:10:41 +0100 Subject: [PATCH 4/5] Update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 170327c..094c500 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,8 @@ pip install . The following notebooks serve as tutorials for users of the package: - [WECHSEL Tutorial](notebooks/WECHSEL_tutorial.ipynb) -- [CLP-Transfer Tutorial](notebooks/CLP_Transfer_tutorial.ipynb) +- [CLP-Transfer Tutorial](notebooks/CLPT_tutorial.ipynb) +- [FOCUS Tutorial](notebooks/FOCUS_tutorial.ipynb) ### Example From 4241d6e2bd0c8277343b98a48ed28c7b50155e5a Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Sat, 23 Nov 2024 15:13:22 +0100 Subject: [PATCH 5/5] Update contributing guide --- CONTRIBUTING.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 09687e9..f51fd90 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,6 @@ # Quick Start +- Install [Git LFS](https://github.com/git-lfs/git-lfs) - Install Python 3.10 (Consider using [pyenv](https://github.com/pyenv/pyenv) for that). - Install [Poetry](https://python-poetry.org). - Create virtual environment and install dependencies: