From c775057d883396e9e3e0e558a09d013863a0f655 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Thu, 14 Nov 2024 20:29:52 +0100 Subject: [PATCH 1/4] Adapt weight average initialization to clp-transfer --- src/langsfer/initialization.py | 122 +++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/src/langsfer/initialization.py b/src/langsfer/initialization.py index 135c807..ac62ca8 100644 --- a/src/langsfer/initialization.py +++ b/src/langsfer/initialization.py @@ -88,29 +88,6 @@ def initialize( ) -> NDArray: rng = np.random.default_rng(seed) - # Map source and target subword tokens to auxiliary token space - source_subword_embeddings = self._map_tokens_into_embedding_space( - self.source_tokenizer, - self.source_auxiliary_embeddings, - ) - target_subword_embeddings = self._map_tokens_into_embedding_space( - self.target_tokenizer, - self.target_auxiliary_embeddings, - ) - - # Align source to target - source_subword_embeddings = self.alignment_strategy.apply( - source_subword_embeddings - ) - - # TODO: investigate why this is needed - source_subword_embeddings /= ( - np.linalg.norm(source_subword_embeddings, axis=1)[:, np.newaxis] + 1e-8 - ) - target_subword_embeddings /= ( - np.linalg.norm(target_subword_embeddings, axis=1)[:, np.newaxis] + 1e-8 - ) - # Initialize target embeddings as random target_embeddings_matrix = rng.normal( np.mean(self.source_embeddings_matrix, axis=0), @@ -125,37 +102,97 @@ def initialize( overlapping_tokens, non_overlapping_tokens = self.token_overlap_strategy.apply( self.source_tokenizer, self.target_tokenizer ) + overlapping_source_token_ids = list( + self.source_tokenizer.convert_tokens_to_ids(overlapping_tokens) + ) + overlapping_target_token_ids = list( + self.target_tokenizer.convert_tokens_to_ids(overlapping_tokens) + ) + non_overlapping_target_token_ids = list( + self.target_tokenizer.convert_tokens_to_ids(non_overlapping_tokens) + ) # Copy overlapping token embedding vectors - for token in tqdm( - overlapping_tokens, desc="Overlapping Tokens", disable=not show_progress - ): - source_token_id = self.source_tokenizer.convert_tokens_to_ids(token) - target_token_id = self.target_tokenizer.convert_tokens_to_ids(token) - target_embeddings_matrix[target_token_id] = self.source_embeddings_matrix[ - source_token_id - ] + # shape of assigned: (n_target_tokens, n_overlapping_tokens) + target_embeddings_matrix[overlapping_target_token_ids] = ( + self.source_embeddings_matrix[overlapping_source_token_ids] + ) # Compute target embedding vectors of non overlapping tokens # as weighted average of source tokens + target_embeddings_matrix[non_overlapping_target_token_ids] = ( + self._compute_non_overlapping_token_embeddings( + overlapping_source_token_ids=overlapping_source_token_ids, + overlapping_target_token_ids=overlapping_target_token_ids, + non_overlapping_target_token_ids=non_overlapping_target_token_ids, + show_progress=show_progress, + ) + ) + return target_embeddings_matrix - non_overlapping_token_ids = list( - sorted(self.target_tokenizer.convert_tokens_to_ids(non_overlapping_tokens)) + def _compute_non_overlapping_token_embeddings( + self, + overlapping_target_token_ids: list[int], + overlapping_source_token_ids: list[int], + non_overlapping_target_token_ids: list[int], + *, + show_progress: bool = False, + ) -> NDArray: + # Map source and target subword tokens to auxiliary token space + target_subword_embeddings = self._map_tokens_into_auxiliary_embedding_space( + self.target_tokenizer, + 
self.target_auxiliary_embeddings, + ) + # TODO: investigate why this is needed + target_subword_embeddings /= ( + np.linalg.norm(target_subword_embeddings, axis=1)[:, np.newaxis] + 1e-8 ) + if self.source_auxiliary_embeddings is None: + reference_subword_embeddings = target_subword_embeddings[ + overlapping_target_token_ids + ].copy() + source_embeddings_matrix = self.source_embeddings_matrix[ + overlapping_source_token_ids + ] + else: + reference_subword_embeddings = ( + self._map_tokens_into_auxiliary_embedding_space( + self.source_tokenizer, + self.source_auxiliary_embeddings, + ) + ) + + # Align source to target + reference_subword_embeddings = self.alignment_strategy.apply( + reference_subword_embeddings + ) + + # TODO: investigate why this is needed + reference_subword_embeddings /= ( + np.linalg.norm(reference_subword_embeddings, axis=1)[:, np.newaxis] + + 1e-8 + ) + + source_embeddings_matrix = self.source_embeddings_matrix + + # Compute target embedding vectors of non overlapping tokens + # as weighted average of source tokens + target_embedding_vec_batches = [] + for token_batch_ids in tqdm( - chunked(non_overlapping_token_ids, self.batch_size), + chunked(non_overlapping_target_token_ids, self.batch_size), desc="Non-Overlapping Tokens", disable=not show_progress, ): # Compute similarities - # shape: (batch_size, n_source_tokens) + # shape: (batch_size, n_reference_embeddings) similarities = self.similarity_strategy.apply( target_subword_embeddings[token_batch_ids], - source_subword_embeddings, + reference_subword_embeddings, ) # compute weights - # shape: (batch_size, n_source_tokens) + # shape: (batch_size, n_reference_embeddings) weights = self.weights_strategy.apply(similarities) # weighted average of source model's overlapping token embeddings @@ -164,17 +201,14 @@ def initialize( weights_row_sum = weights.sum(axis=1) # shape: (batch_size, source_embedding_dim) non_overlapping_embedding_vectors = ( - weights @ self.source_embeddings_matrix / weights_row_sum[:, np.newaxis] + weights @ source_embeddings_matrix / weights_row_sum[:, np.newaxis] ) - target_embeddings_matrix[token_batch_ids] = ( - non_overlapping_embedding_vectors - ) - - return target_embeddings_matrix + target_embedding_vec_batches.append(non_overlapping_embedding_vectors) + return np.concatenate(target_embedding_vec_batches, axis=0) @staticmethod - def _map_tokens_into_embedding_space( + def _map_tokens_into_auxiliary_embedding_space( tokenizer: PreTrainedTokenizerBase, embeddings: AuxiliaryEmbeddings, ) -> NDArray: From 694b76cbb89bb88cffd57f177f50cb365127d4d8 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Thu, 14 Nov 2024 20:30:21 +0100 Subject: [PATCH 2/4] Improve high level functions docstrings --- src/langsfer/high_level.py | 47 ++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/langsfer/high_level.py b/src/langsfer/high_level.py index 8c15214..d7d8d47 100644 --- a/src/langsfer/high_level.py +++ b/src/langsfer/high_level.py @@ -60,16 +60,16 @@ def wechsel( - trainining fastText embeddings from scratch. Args: - source_tokenizer: Source model's tokenizer - source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer - target_tokenizer: Target model's tokenizer - target_auxiliary_embeddings: - source_auxiliary_embeddings: + source_tokenizer: Source model's tokenizer. + source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer. 
+        target_tokenizer: Target model's tokenizer.
+        target_auxiliary_embeddings: FastText auxiliary embeddings in the target language.
+        source_auxiliary_embeddings: FastText auxiliary embeddings in the source language.
         bilingual_dictionary: Dictionary mapping words in source language to words in target language.
-        bilingual_dictionary_file: Path to a bilingual dictionary file
-        temperature: Softmax temperature to apply for weight computation
-        k: Number of closest / most similar tokens to consider for weight computation
-        batch_size: Size of the batches of non-overlapping token computations
+        bilingual_dictionary_file: Path to a bilingual dictionary file.
+        temperature: Softmax temperature to apply for weight computation.
+        k: Number of closest / most similar tokens to consider for weight computation.
+        batch_size: Size of the batches of non-overlapping token computations.
     """
     embeddings_initializer = WeightedAverageEmbeddingsInitialization(
         source_tokenizer=source_tokenizer,
@@ -105,12 +105,19 @@ def clp_transfer(
     Described in [CLP-Transfer: Efficient language model training through cross-lingual and progressive transfer learning.](https://arxiv.org/abs/2301.09626)
     Ostendorff, Malte, and Georg Rehm. arXiv preprint arXiv:2301.09626 (2023).
 
+    The method requires as input:
+
+    - a tokenizer in the source language,
+    - a pre-trained language model in the source language,
+    - a tokenizer in the target language,
+    - a helper pre-trained language model in the target language.
+
     Args:
-        source_tokenizer: Source model's tokenizer
-        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer
-        target_tokenizer: Target model's tokenizer
-        target_auxiliary_embeddings:
-        batch_size: Size of the batches of non-overlapping token computations
+        source_tokenizer: Source model's tokenizer.
+        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer.
+        target_tokenizer: Target model's tokenizer.
+        target_auxiliary_embeddings: Auxiliary embeddings in the target language, e.g. the helper model's input embeddings.
+        batch_size: Size of the batches of non-overlapping token computations.
     """
     embeddings_initializer = WeightedAverageEmbeddingsInitialization(
         source_tokenizer=source_tokenizer,
@@ -140,12 +147,12 @@ def focus(
     Described in [FOCUS: Effective Embedding Initialization for Specializing Pretrained Multilingual Models on a Single Language.](https://arxiv.org/abs/2305.14481)
     Dobler, Konstantin, and Gerard de Melo. arXiv preprint arXiv:2305.14481 (2023).
 
     Args:
-        source_tokenizer: Source model's tokenizer
-        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer
-        target_tokenizer: Target model's tokenizer
-        target_auxiliary_embeddings:
-        source_auxiliary_embeddings:
-        batch_size: Size of the batches of non-overlapping token computations
+        source_tokenizer: Source model's tokenizer.
+        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer.
+        target_tokenizer: Target model's tokenizer.
+        target_auxiliary_embeddings: FastText auxiliary embeddings in the target language.
+        source_auxiliary_embeddings: FastText auxiliary embeddings in the source language.
+        batch_size: Size of the batches of non-overlapping token computations.
""" embeddings_initializer = WeightedAverageEmbeddingsInitialization( source_tokenizer=source_tokenizer, From 0f5a6ffb677023b54c786e8f09a6900b8164745f Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Fri, 15 Nov 2024 08:00:39 +0100 Subject: [PATCH 3/4] Add tutorial notebook for CLP-Transfer --- notebooks/CLPT_tutorial.ipynb | 743 ++++++++++++++++++++++++++++++++++ 1 file changed, 743 insertions(+) create mode 100644 notebooks/CLPT_tutorial.ipynb diff --git a/notebooks/CLPT_tutorial.ipynb b/notebooks/CLPT_tutorial.ipynb new file mode 100644 index 0000000..b0705e7 --- /dev/null +++ b/notebooks/CLPT_tutorial.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CLP-Transfer Tutorial\n", + "\n", + "In this tutorial, we will use Langsfer to transfer a model trained in English to German with the [CLP-Transfer](https://arxiv.org/abs/2301.09626) method, similarily to one of the experiments described in the paper.\n", + "\n", + "Cross-Lingual and Progressive Transfer, or CLP-Transfer for short, is another cross-lingual language transfer method that efficiently initializes the embedding parameters of a language model in a target language using the embedding parameters from an existing model in a source language as well as the embedding parameters of a helper model in the target language.\n", + "\n", + "The method requires as input:\n", + "\n", + "- a tokenizer in the source language,\n", + "- a pre-trained language model in the source language,\n", + "- a tokenizer in the target language,\n", + "- a helper pre-trained language model in the target language.\n", + "\n", + "For the tutorial, we will use as much as possible the same parameters as described in the paper:\n", + "\n", + "- For the source model and tokenizer, we will use [gpt2-large](openai-community/gpt2-large),\n", + "- For the helper model and target tokenizer, we will use [benjamin/gpt2-wechsel-german](https://huggingface.co/benjamin/gpt2-wechsel-german).\n", + "\n", + "For the sake of brevity, we will however use fewer training samples and steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "We begin by importing libraries and setting some defaults." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "import random\n", + "import warnings\n", + "\n", + "import datasets\n", + "import numpy as np\n", + "import torch\n", + "from transformers import (\n", + " AutoModel,\n", + " AutoTokenizer,\n", + " AutoModelForCausalLM,\n", + " DataCollatorForLanguageModeling,\n", + " TrainingArguments,\n", + " Trainer,\n", + ")\n", + "\n", + "warnings.simplefilter(\"ignore\")\n", + "\n", + "# Constants\n", + "SOURCE_MODEL_NAME = \"openai-community/gpt2-large\"\n", + "HELPER_MODEL_NAME = \"benjamin/gpt2-wechsel-german\"\n", + "DATASET_NAME = \"oscar-corpus/oscar\"\n", + "DATASET_CONFIG_NAME = \"unshuffled_deduplicated_de\"\n", + "DATASET_SIZE = 20000\n", + "TRAIN_DATASET_SIZE = 16000\n", + "TRAIN_BATCH_SIZE = 2\n", + "GRADIENT_ACCUMULATION_STEPS = 64\n", + "EVAL_STEPS = 4000\n", + "MAX_TRAIN_STEPS = 48000\n", + "LEARNING_RATE = 1e-4\n", + "WEIGHT_DECAY = 0.01\n", + "ADAM_EPSILON = 1e-6\n", + "ADAM_BETA1 = 0.9\n", + "ADAM_BETA2 = 0.98\n", + "SEED = 16\n", + "\n", + "random.seed(SEED)\n", + "np.random.seed(SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the following functions and classes from Langsfer." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload\n", + "from langsfer.high_level import clp_transfer\n", + "from langsfer.embeddings import TransformersEmbeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset\n", + "\n", + "We use the [datasets](https://huggingface.co/docs/datasets/index) library to load the [oscar](https://huggingface.co/datasets/oscar-corpus/oscar), which stands for **O**pen **S**uper-large **C**rawled **A**LMAnaCH co**R**pus, dataset's german configuration and then take a limited number of samples from it for training and validation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = datasets.load_dataset(\n", + " DATASET_NAME,\n", + " DATASET_CONFIG_NAME,\n", + " split=\"train\",\n", + " streaming=True,\n", + " trust_remote_code=True,\n", + ")\n", + "dataset = dataset.shuffle(seed=SEED)\n", + "dataset = dataset.take(DATASET_SIZE)\n", + "train_dataset = dataset.take(TRAIN_DATASET_SIZE)\n", + "val_dataset = dataset.skip(TRAIN_DATASET_SIZE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We take sample text from the validation set in order to evaluate the generation of our trained model at the end. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. 
Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n" + ] + } + ], + "source": [ + "sample_text = list(val_dataset.skip(10).take(1))[0][\"text\"]\n", + "print(sample_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings and Tokenizers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load the source tokenizer as well as the source model and extract the input embeddings matrix from it." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "source_tokenizer = AutoTokenizer.from_pretrained(SOURCE_MODEL_NAME)\n", + "source_model = AutoModel.from_pretrained(SOURCE_MODEL_NAME)\n", + "source_embeddings_matrix = source_model.get_input_embeddings().weight.detach().numpy()\n", + "del source_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then load the target tokenizer as well as the helper model's embeddings to use as auxiliary embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "target_tokenizer = AutoTokenizer.from_pretrained(HELPER_MODEL_NAME)\n", + "target_auxiliary_embeddings = TransformersEmbeddings.from_model_name_or_path(\n", + " HELPER_MODEL_NAME\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens 172, tokens: ['mit', 'ĠEva', 'ĠMatt', 'es', 'Ġal', 's', 'ĠKl', 'ara', 'ĠBl', 'um', '.', 'ĠE', 'ine', 'ĠB', 'ä', 'cker', 'st', 'och', 'ter', 'Ġstir', 'bt', 'Ġin', 'Ġder', 'ĠBack', 'r', 'ö', 'h', 're', '.', 'ĠJet', 'z', 't', 'Ġhat', 'Ġi', 'h', 're', 'ĠSchw', 'ester', 'Ġ(', 'Jul', 'ia', 'ĠJ', 'ents', 'ch', ')', 'ĠAng', 'st', 'âĢ¦', 'ĠDo', 'ppel', 'b', 'ö', 'dig', '.', 'Ċ', 'in', 'Ġder', 'Ġre', 'ch', 'ten', 'ĠArm', 'be', 'uge', 'Ġbe', 'im', 'ĠÃĸ', 'ff', 'nen', 'Ġdes', 'ĠMeh', 'ls', 'il', 'os', 'Ġz', 'ur', 'ĠR', 'ett', 'ung', 'Ġder', 'ĠB', 'ä', 'cker', 'st', 'oc', 'her', ',', 'Ġwel', 'ch', 'Ġe', 'in', 'ĠReg', 'ief', 'eh', 'ler', '!', 'ĠDer', 'Ġt', 'ief', 'er', 'ge', 'hend', 'e', 'ĠSinn', 'Ġdes', 'ĠFall', 'es', 'Ġw', 'ird', 'Ġan', 'son', 'sten', 'Ġa', 'uch', 'Ġn', 'icht', 'Ġk', 'lar', '.', 'ĠW', 'irk', 't', 'Ġle', 'ider', 'Ġall', 'es', 'Ġet', 'was', 'Ġz', 'us', 'amm', 'enges', 'ch', 'ust', 'ert', '.', 'Ċ', 'W', 'er', 'Ġsp', 'iel', 'te', 'Ġdie', 'ĠHau', 'pt', 'rol', 'le', 'Ġin', 'ĠFilm', 'Ġ\"', 'The', 'ĠInternational', '\"', 'Ġund', 'Ġw', 'urd', 'e', 'Ġal', 's', 'Ġpot', 'enzie', 'ller', 'ĠJames', 'ĠBond', '-', 'N', 'ach', 'fol', 'ger', 'Ġge', 'hand', 'elt', '?']\n" + ] + } + ], + "source": [ + "tokens = source_tokenizer.tokenize(sample_text)\n", + "print(f\"Number of tokens {len(tokens)}, tokens: {tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then use the target tokenizer to convert the sample text to tokens and notice that the conversion creates fewer tokens than previously." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens 108, tokens: ['mit', 'ĠEva', 'ĠMatt', 'es', 'Ġals', 'ĠKl', 'ara', 'ĠBlum', '.', 'ĠEine', 'ĠBäcker', 'st', 'ochter', 'Ġstirbt', 'Ġin', 'Ġder', 'ĠBack', 'röhre', '.', 'ĠJetzt', 'Ġhat', 'Ġihre', 'ĠSchwester', 'Ġ(', 'Jul', 'ia', 'ĠJ', 'ent', 'sch', ')', 'ĠAngst', 'âĢ¦', 'ĠDoppel', 'bö', 'dig', '.', 'Ċ', 'in', 'Ġder', 'Ġrechten', 'ĠArmb', 'euge', 'Ġbeim', 'ĠÃĸffnen', 'Ġdes', 'ĠMehl', 'sil', 'os', 'Ġzur', 'ĠRettung', 'Ġder', 'ĠBäcker', 'st', 'ocher', ',', 'Ġwelch', 'Ġein', 'ĠReg', 'ief', 'ehler', '!', 'ĠDer', 'Ġtiefer', 'gehende', 'ĠSinn', 'Ġdes', 'ĠFall', 'es', 'Ġwird', 'Ġansonsten', 'Ġauch', 'Ġnicht', 'Ġklar', '.', 'ĠWir', 'kt', 'Ġleider', 'Ġalles', 'Ġetwas', 'Ġzusammen', 'gesch', 'uster', 't', '.', 'Ċ', 'Wer', 'Ġspielte', 'Ġdie', 'ĠHauptrolle', 'Ġin', 'ĠFilm', 'Ġ\"', 'The', 'ĠInternational', '\"', 'Ġund', 'Ġwurde', 'Ġals', 'Ġpoten', 'ziel', 'ler', 'ĠJames', 'ĠBond', '-', 'Nach', 'folger', 'Ġgehandelt', '?']\n" + ] + } + ], + "source": [ + "tokens = target_tokenizer.tokenize(sample_text)\n", + "print(f\"Number of tokens {len(tokens)}, tokens: {tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finally, instantiate the embedding initializer for CLP-Transfer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_initializer = clp_transfer(\n", + " source_tokenizer=source_tokenizer,\n", + " source_embeddings_matrix=source_embeddings_matrix,\n", + " target_tokenizer=target_tokenizer,\n", + " target_auxiliary_embeddings=target_auxiliary_embeddings,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then initialize the target embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cc6a005f21bb456faa0c53b27ebe5431", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Non-Overlapping Tokens: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "target_embeddings_matrix = embedding_initializer.initialize(seed=16, show_progress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have the initialized embeddings matrix, we can use it to replace the embeddings matrix in the source model. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "target_model_wechsel = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)\n", + "\n", + "# Resize its embedding layer\n", + "target_model_wechsel.resize_token_embeddings(len(target_tokenizer))\n", + "\n", + "# Replace the source embeddings matrix with the target embeddings matrix\n", + "target_model_wechsel.get_input_embeddings().weight.data = torch.as_tensor(\n", + " target_embeddings_matrix\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> We used `AutoModelForCausalLM` instead of `AutoModel` because we will train the newly initialized model for causal language modelling." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset preprocessing\n", + "\n", + "Before training, we must preprocess the training and validation sets by tokenizing the text, removing all other columns and then converting the resulting arrays to PyTorch tensors." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = train_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")\n", + "train_dataset = train_dataset.with_format(\"torch\")\n", + "\n", + "val_dataset = val_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")\n", + "val_dataset = val_dataset.with_format(\"torch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define the training parameters and instantiate a [Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer) object." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "max_steps is given, it will override any value given in num_train_epochs\n" + ] + } + ], + "source": [ + "data_collator = DataCollatorForLanguageModeling(tokenizer=target_tokenizer, mlm=False)\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=\"/tmp/clp_transfer\",\n", + " eval_strategy=\"steps\",\n", + " report_to=\"tensorboard\",\n", + " eval_steps=EVAL_STEPS // GRADIENT_ACCUMULATION_STEPS,\n", + " max_steps=MAX_TRAIN_STEPS // GRADIENT_ACCUMULATION_STEPS,\n", + " per_device_train_batch_size=TRAIN_BATCH_SIZE,\n", + " gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n", + " learning_rate=LEARNING_RATE,\n", + " weight_decay=WEIGHT_DECAY,\n", + " adam_epsilon=ADAM_EPSILON,\n", + " adam_beta1=ADAM_BETA1,\n", + " adam_beta2=ADAM_BETA2,\n", + " bf16=True,\n", + ")\n", + "\n", + "if target_tokenizer.pad_token is None:\n", + " target_tokenizer.pad_token = target_tokenizer.eos_token\n", + "\n", + "trainer = Trainer(\n", + " model=target_model_wechsel,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " data_collator=data_collator,\n", + " tokenizer=target_tokenizer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We evaluate the model before training" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation loss before training: 10.631\n" + ] + } + ], + "source": [ + "eval_loss = trainer.evaluate()[\"eval_loss\"]\n", + "print(f\"Evaluation loss before training: {eval_loss:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [750/750 9:37:24, Epoch 11/9223372036854775807]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation Loss
62No log6.976626
124No log6.584036
186No log6.243427
248No log5.986516
310No log5.811096
372No log5.672745
434No log5.567986
496No log5.485695
5586.2780005.421839
6206.2780005.378000
6826.2780005.350171
7446.2780005.338133

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=750, training_loss=6.01977197265625, metrics={'train_runtime': 34687.4607, 'train_samples_per_second': 5.535, 'train_steps_per_second': 0.022, 'total_flos': 5.861596369744896e+17, 'train_loss': 6.01977197265625, 'epoch': 11.083333333333334})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finally evaluate the model after the training" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation loss after training: 5.338\n" + ] + } + ], + "source": [ + "eval_loss = trainer.evaluate()[\"eval_loss\"]\n", + "print(f\"Evaluation loss after training: {eval_loss:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an additional evaluation, we take the sample text, truncate it and then make the trained model generate a completion for it " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n", + "---\n", + "Shortened Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester\n", + "---\n", + "Generated Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester die Welt, das sie mit dem Mann und den anderen Menschen zu tun ist:\n", + "Die Frau wird von einem kleinen Kind aus einer großen Stadt auf ihrem Weg im Wald gebracht worden – auch wenn es sich um eine große Geschichte gibt! Die Mutter wurde am Ende des Jahres nach Berlin-Wittenberg (Bavaria) an diesem Tag wieder vor Ort sein; er war ein sehr gut bewebtter Leben für seine Familie… Aber ich habe mich nicht mehr so viel machen 🙂 Und da kann man ja schon mal noch einen paar Tage Zeit haben 😉 . 
Ich bin mir aber immer nur einmal etwas richtig gemacht wie ihr meine Mama oder Papa sind😂🤷️ #sunnylife 🌸#tweeting @kim_matthesbaby A post by kimi matzhaynesblog ⚽⛳❗‍♀☺ pic.twitter.com/w7qxrY6X5I — Kim MATTES 👩👨✈ ✅ ☆ ㄹˋ〜(をローで、結対の術架院京伝。 )[1] https://www.youtube.com/watch?v=8z4RpJy2fjE&\n" + ] + } + ], + "source": [ + "sample_input_ids = target_tokenizer(sample_text)[\"input_ids\"]\n", + "shortened_input_ids = sample_input_ids[: len(sample_input_ids) // 3 - 13]\n", + "shortened_text = target_tokenizer.decode(shortened_input_ids, add_special_tokens=False)\n", + "\n", + "generated_token_ids = (\n", + " trainer.model.generate(\n", + " torch.as_tensor(shortened_input_ids).reshape(1, -1).to(trainer.model.device),\n", + " max_length=300,\n", + " min_length=10,\n", + " top_p=0.9,\n", + " temperature=0.9,\n", + " repetition_penalty=2.0,\n", + " )\n", + " .detach()\n", + " .cpu()\n", + " .numpy()\n", + " .reshape(-1)\n", + ")\n", + "generated_tokens = target_tokenizer.decode(\n", + " generated_token_ids, add_special_tokens=False\n", + ")\n", + "print(\"Original Text:\")\n", + "print(sample_text)\n", + "print(\"---\")\n", + "print(\"Shortened Text:\")\n", + "print(shortened_text)\n", + "print(\"---\")\n", + "print(\"Generated Text:\")\n", + "print(generated_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generated text's quality is not bad but the model needs further training on more data.\n", + "This was just done for the sake of the tutorial and is not meant to be a full-blown model training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary\n", + "\n", + "In this tutorial, we have seen how to use CLP-Transfer in order to transfer a pre-trained language model to a new language." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langsfer-I0oHYpHZ-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 8e110b2a9004bc073420e3e4c38edf0a20d0394a Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Fri, 15 Nov 2024 08:00:53 +0100 Subject: [PATCH 4/4] Update readme --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 15b05de..170327c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Language transfer refers to a few related things: from another LLM trained in high-resource source language (e.g. English), - extending the vocabulary of an LLM by adding new tokens and initializing their embeddings in a manner that allows them to be used with little to no extra training, -- specializing the vocabulary of a multilingual LLM to one of its supported languages. +- specializing the vocabulary of a multilingual LLM to one of its supported languages. The library currently implements the following methods: @@ -59,6 +59,7 @@ pip install . 
 The following notebooks serve as tutorials for users of the package:
 
 - [WECHSEL Tutorial](notebooks/WECHSEL_tutorial.ipynb)
+- [CLP-Transfer Tutorial](notebooks/CLPT_tutorial.ipynb)
 
 ### Example
 
@@ -127,11 +128,11 @@ Refer to the [contributing guide](CONTRIBUTING.md) for instructions on you can m
 
 ## Logo
 
-The langsfer logo was created by my good friend [Zakaria Taleb Hacine](https://behance.net/zakariahacine), a 3D artist with
+The langsfer logo was created by my good friend [Zakaria Taleb Hacine](https://behance.net/zakariahacine), a 3D artist with
 industry experience and a packed portfolio.
 
 The logo contains the latin alphabet letters A and I which are an acronym for Artificial Intelligence and the arabic alphabet letters
-أ and ذ which are an acronym for الذكاء الاصطناعي, which is Artificial Intelligence in arabic.
+أ and ذ which are an acronym for ذكاء اصطناعي, which is Artificial Intelligence in arabic.
 
 The fonts used are [Ethnocentric Regular](https://www.myfonts.com/products/ethnocentric-ethnocentric-970121) and [Readex Pro](https://fonts.google.com/specimen/Readex+Pro).
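
As a quick reference, the CLP-Transfer workflow exercised in the tutorial notebook condenses to the sketch below. It only restates the notebook's cells in one place, using the tutorial's model names (`openai-community/gpt2-large` as the source model, `benjamin/gpt2-wechsel-german` as the helper model); treat it as a sketch rather than the library's canonical example.

```python
import torch
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from langsfer.embeddings import TransformersEmbeddings
from langsfer.high_level import clp_transfer

SOURCE_MODEL_NAME = "openai-community/gpt2-large"
HELPER_MODEL_NAME = "benjamin/gpt2-wechsel-german"

# Source tokenizer and the source model's input embeddings matrix
source_tokenizer = AutoTokenizer.from_pretrained(SOURCE_MODEL_NAME)
source_model = AutoModel.from_pretrained(SOURCE_MODEL_NAME)
source_embeddings_matrix = source_model.get_input_embeddings().weight.detach().numpy()

# Target tokenizer and the helper model's embeddings, used as auxiliary embeddings
target_tokenizer = AutoTokenizer.from_pretrained(HELPER_MODEL_NAME)
target_auxiliary_embeddings = TransformersEmbeddings.from_model_name_or_path(
    HELPER_MODEL_NAME
)

# Build the CLP-Transfer initializer and compute the target embeddings matrix
embedding_initializer = clp_transfer(
    source_tokenizer=source_tokenizer,
    source_embeddings_matrix=source_embeddings_matrix,
    target_tokenizer=target_tokenizer,
    target_auxiliary_embeddings=target_auxiliary_embeddings,
)
target_embeddings_matrix = embedding_initializer.initialize(seed=16, show_progress=True)

# Plug the initialized matrix into a fresh copy of the source model
target_model = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)
target_model.resize_token_embeddings(len(target_tokenizer))
target_model.get_input_embeddings().weight.data = torch.as_tensor(target_embeddings_matrix)
```

From here the tutorial notebook tokenizes an OSCAR subset with the target tokenizer and fine-tunes the re-initialized model with a standard causal language modelling `Trainer` run.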