From c775057d883396e9e3e0e558a09d013863a0f655 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Thu, 14 Nov 2024 20:29:52 +0100 Subject: [PATCH 1/4] Adapt weight average initialization to clp-transfer --- src/langsfer/initialization.py | 122 +++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 44 deletions(-) diff --git a/src/langsfer/initialization.py b/src/langsfer/initialization.py index 135c807..ac62ca8 100644 --- a/src/langsfer/initialization.py +++ b/src/langsfer/initialization.py @@ -88,29 +88,6 @@ def initialize( ) -> NDArray: rng = np.random.default_rng(seed) - # Map source and target subword tokens to auxiliary token space - source_subword_embeddings = self._map_tokens_into_embedding_space( - self.source_tokenizer, - self.source_auxiliary_embeddings, - ) - target_subword_embeddings = self._map_tokens_into_embedding_space( - self.target_tokenizer, - self.target_auxiliary_embeddings, - ) - - # Align source to target - source_subword_embeddings = self.alignment_strategy.apply( - source_subword_embeddings - ) - - # TODO: investigate why this is needed - source_subword_embeddings /= ( - np.linalg.norm(source_subword_embeddings, axis=1)[:, np.newaxis] + 1e-8 - ) - target_subword_embeddings /= ( - np.linalg.norm(target_subword_embeddings, axis=1)[:, np.newaxis] + 1e-8 - ) - # Initialize target embeddings as random target_embeddings_matrix = rng.normal( np.mean(self.source_embeddings_matrix, axis=0), @@ -125,37 +102,97 @@ def initialize( overlapping_tokens, non_overlapping_tokens = self.token_overlap_strategy.apply( self.source_tokenizer, self.target_tokenizer ) + overlapping_source_token_ids = list( + self.source_tokenizer.convert_tokens_to_ids(overlapping_tokens) + ) + overlapping_target_token_ids = list( + self.target_tokenizer.convert_tokens_to_ids(overlapping_tokens) + ) + non_overlapping_target_token_ids = list( + self.target_tokenizer.convert_tokens_to_ids(non_overlapping_tokens) + ) # Copy overlapping token embedding vectors - for token in tqdm( - overlapping_tokens, desc="Overlapping Tokens", disable=not show_progress - ): - source_token_id = self.source_tokenizer.convert_tokens_to_ids(token) - target_token_id = self.target_tokenizer.convert_tokens_to_ids(token) - target_embeddings_matrix[target_token_id] = self.source_embeddings_matrix[ - source_token_id - ] + # shape of assigned: (n_target_tokens, n_overlapping_tokens) + target_embeddings_matrix[overlapping_target_token_ids] = ( + self.source_embeddings_matrix[overlapping_source_token_ids] + ) # Compute target embedding vectors of non overlapping tokens # as weighted average of source tokens + target_embeddings_matrix[non_overlapping_target_token_ids] = ( + self._compute_non_overlapping_token_embeddings( + overlapping_source_token_ids=overlapping_source_token_ids, + overlapping_target_token_ids=overlapping_target_token_ids, + non_overlapping_target_token_ids=non_overlapping_target_token_ids, + show_progress=show_progress, + ) + ) + return target_embeddings_matrix - non_overlapping_token_ids = list( - sorted(self.target_tokenizer.convert_tokens_to_ids(non_overlapping_tokens)) + def _compute_non_overlapping_token_embeddings( + self, + overlapping_target_token_ids: list[int], + overlapping_source_token_ids: list[int], + non_overlapping_target_token_ids: list[int], + *, + show_progress: bool = False, + ) -> NDArray: + # Map source and target subword tokens to auxiliary token space + target_subword_embeddings = self._map_tokens_into_auxiliary_embedding_space( + self.target_tokenizer, + 
self.target_auxiliary_embeddings, + ) + # TODO: investigate why this is needed + target_subword_embeddings /= ( + np.linalg.norm(target_subword_embeddings, axis=1)[:, np.newaxis] + 1e-8 ) + if self.source_auxiliary_embeddings is None: + reference_subword_embeddings = target_subword_embeddings[ + overlapping_target_token_ids + ].copy() + source_embeddings_matrix = self.source_embeddings_matrix[ + overlapping_source_token_ids + ] + else: + reference_subword_embeddings = ( + self._map_tokens_into_auxiliary_embedding_space( + self.source_tokenizer, + self.source_auxiliary_embeddings, + ) + ) + + # Align source to target + reference_subword_embeddings = self.alignment_strategy.apply( + reference_subword_embeddings + ) + + # TODO: investigate why this is needed + reference_subword_embeddings /= ( + np.linalg.norm(reference_subword_embeddings, axis=1)[:, np.newaxis] + + 1e-8 + ) + + source_embeddings_matrix = self.source_embeddings_matrix + + # Compute target embedding vectors of non overlapping tokens + # as weighted average of source tokens + target_embedding_vec_batches = [] + for token_batch_ids in tqdm( - chunked(non_overlapping_token_ids, self.batch_size), + chunked(non_overlapping_target_token_ids, self.batch_size), desc="Non-Overlapping Tokens", disable=not show_progress, ): # Compute similarities - # shape: (batch_size, n_source_tokens) + # shape: (batch_size, n_reference_embeddings) similarities = self.similarity_strategy.apply( target_subword_embeddings[token_batch_ids], - source_subword_embeddings, + reference_subword_embeddings, ) # compute weights - # shape: (batch_size, n_source_tokens) + # shape: (batch_size, n_reference_embeddings) weights = self.weights_strategy.apply(similarities) # weighted average of source model's overlapping token embeddings @@ -164,17 +201,14 @@ def initialize( weights_row_sum = weights.sum(axis=1) # shape: (batch_size, source_embedding_dim) non_overlapping_embedding_vectors = ( - weights @ self.source_embeddings_matrix / weights_row_sum[:, np.newaxis] + weights @ source_embeddings_matrix / weights_row_sum[:, np.newaxis] ) - target_embeddings_matrix[token_batch_ids] = ( - non_overlapping_embedding_vectors - ) - - return target_embeddings_matrix + target_embedding_vec_batches.append(non_overlapping_embedding_vectors) + return np.concatenate(target_embedding_vec_batches, axis=0) @staticmethod - def _map_tokens_into_embedding_space( + def _map_tokens_into_auxiliary_embedding_space( tokenizer: PreTrainedTokenizerBase, embeddings: AuxiliaryEmbeddings, ) -> NDArray: From 694b76cbb89bb88cffd57f177f50cb365127d4d8 Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Thu, 14 Nov 2024 20:30:21 +0100 Subject: [PATCH 2/4] Improve high level functions docstrings --- src/langsfer/high_level.py | 47 ++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/langsfer/high_level.py b/src/langsfer/high_level.py index 8c15214..d7d8d47 100644 --- a/src/langsfer/high_level.py +++ b/src/langsfer/high_level.py @@ -60,16 +60,16 @@ def wechsel( - trainining fastText embeddings from scratch. Args: - source_tokenizer: Source model's tokenizer - source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer - target_tokenizer: Target model's tokenizer - target_auxiliary_embeddings: - source_auxiliary_embeddings: + source_tokenizer: Source model's tokenizer. + source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer. 
+        target_tokenizer: Target model's tokenizer.
+        target_auxiliary_embeddings: FastText auxiliary embeddings in the target language.
+        source_auxiliary_embeddings: FastText auxiliary embeddings in the source language.
         bilingual_dictionary: Dictionary mapping words in source language to words in target language.
-        bilingual_dictionary_file: Path to a bilingual dictionary file
-        temperature: Softmax temperature to apply for weight computation
-        k: Number of closest / most similar tokens to consider for weight computation
-        batch_size: Size of the batches of non-overlapping token computations
+        bilingual_dictionary_file: Path to a bilingual dictionary file.
+        temperature: Softmax temperature to apply for weight computation.
+        k: Number of closest / most similar tokens to consider for weight computation.
+        batch_size: Size of the batches of non-overlapping token computations.
     """
     embeddings_initializer = WeightedAverageEmbeddingsInitialization(
         source_tokenizer=source_tokenizer,
@@ -105,12 +105,19 @@ def clp_transfer(
     Described in [CLP-Transfer: Efficient language model training through cross-lingual and progressive transfer learning.](https://arxiv.org/abs/2301.09626)
     Ostendorff, Malte, and Georg Rehm. arXiv preprint arXiv:2301.09626 (2023).
 
+    The method requires as input:
+
+    - a tokenizer in the source language,
+    - a pre-trained language model in the source language,
+    - a tokenizer in the target language,
+    - a helper pre-trained language model in the target language.
+
     Args:
-        source_tokenizer: Source model's tokenizer
-        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer
-        target_tokenizer: Target model's tokenizer
-        target_auxiliary_embeddings:
-        batch_size: Size of the batches of non-overlapping token computations
+        source_tokenizer: Source model's tokenizer.
+        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer.
+        target_tokenizer: Target model's tokenizer.
+        target_auxiliary_embeddings: Auxiliary embeddings in the target language, e.g. the helper model's input embeddings.
+        batch_size: Size of the batches of non-overlapping token computations.
     """
     embeddings_initializer = WeightedAverageEmbeddingsInitialization(
         source_tokenizer=source_tokenizer,
@@ -140,12 +147,12 @@ def focus(
     Described in [FOCUS: Effective Embedding Initialization for Specializing Pretrained Multilingual Models on a Single Language.](https://arxiv.org/abs/2305.14481)
     Dobler, Konstantin, and Gerard de Melo. arXiv preprint arXiv:2305.14481 (2023).
 
     Args:
-        source_tokenizer: Source model's tokenizer
-        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer
-        target_tokenizer: Target model's tokenizer
-        target_auxiliary_embeddings:
-        source_auxiliary_embeddings:
-        batch_size: Size of the batches of non-overlapping token computations
+        source_tokenizer: Source model's tokenizer.
+        source_embeddings_matrix: Matrix or 2D array containing the weights of the source model's embedding layer.
+        target_tokenizer: Target model's tokenizer.
+        target_auxiliary_embeddings: FastText auxiliary embeddings in the target language.
+        source_auxiliary_embeddings: FastText auxiliary embeddings in the source language.
+        batch_size: Size of the batches of non-overlapping token computations.
""" embeddings_initializer = WeightedAverageEmbeddingsInitialization( source_tokenizer=source_tokenizer, From 0f5a6ffb677023b54c786e8f09a6900b8164745f Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Fri, 15 Nov 2024 08:00:39 +0100 Subject: [PATCH 3/4] Add tutorial notebook for CLP-Transfer --- notebooks/CLPT_tutorial.ipynb | 743 ++++++++++++++++++++++++++++++++++ 1 file changed, 743 insertions(+) create mode 100644 notebooks/CLPT_tutorial.ipynb diff --git a/notebooks/CLPT_tutorial.ipynb b/notebooks/CLPT_tutorial.ipynb new file mode 100644 index 0000000..b0705e7 --- /dev/null +++ b/notebooks/CLPT_tutorial.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CLP-Transfer Tutorial\n", + "\n", + "In this tutorial, we will use Langsfer to transfer a model trained in English to German with the [CLP-Transfer](https://arxiv.org/abs/2301.09626) method, similarily to one of the experiments described in the paper.\n", + "\n", + "Cross-Lingual and Progressive Transfer, or CLP-Transfer for short, is another cross-lingual language transfer method that efficiently initializes the embedding parameters of a language model in a target language using the embedding parameters from an existing model in a source language as well as the embedding parameters of a helper model in the target language.\n", + "\n", + "The method requires as input:\n", + "\n", + "- a tokenizer in the source language,\n", + "- a pre-trained language model in the source language,\n", + "- a tokenizer in the target language,\n", + "- a helper pre-trained language model in the target language.\n", + "\n", + "For the tutorial, we will use as much as possible the same parameters as described in the paper:\n", + "\n", + "- For the source model and tokenizer, we will use [gpt2-large](openai-community/gpt2-large),\n", + "- For the helper model and target tokenizer, we will use [benjamin/gpt2-wechsel-german](https://huggingface.co/benjamin/gpt2-wechsel-german).\n", + "\n", + "For the sake of brevity, we will however use fewer training samples and steps." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "We begin by importing libraries and setting some defaults." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [ + "remove-cell" + ] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%load_ext tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "tags": [ + "hide-cell" + ] + }, + "outputs": [], + "source": [ + "import random\n", + "import warnings\n", + "\n", + "import datasets\n", + "import numpy as np\n", + "import torch\n", + "from transformers import (\n", + " AutoModel,\n", + " AutoTokenizer,\n", + " AutoModelForCausalLM,\n", + " DataCollatorForLanguageModeling,\n", + " TrainingArguments,\n", + " Trainer,\n", + ")\n", + "\n", + "warnings.simplefilter(\"ignore\")\n", + "\n", + "# Constants\n", + "SOURCE_MODEL_NAME = \"openai-community/gpt2-large\"\n", + "HELPER_MODEL_NAME = \"benjamin/gpt2-wechsel-german\"\n", + "DATASET_NAME = \"oscar-corpus/oscar\"\n", + "DATASET_CONFIG_NAME = \"unshuffled_deduplicated_de\"\n", + "DATASET_SIZE = 20000\n", + "TRAIN_DATASET_SIZE = 16000\n", + "TRAIN_BATCH_SIZE = 2\n", + "GRADIENT_ACCUMULATION_STEPS = 64\n", + "EVAL_STEPS = 4000\n", + "MAX_TRAIN_STEPS = 48000\n", + "LEARNING_RATE = 1e-4\n", + "WEIGHT_DECAY = 0.01\n", + "ADAM_EPSILON = 1e-6\n", + "ADAM_BETA1 = 0.9\n", + "ADAM_BETA2 = 0.98\n", + "SEED = 16\n", + "\n", + "random.seed(SEED)\n", + "np.random.seed(SEED)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the following functions and classes from Langsfer." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%autoreload\n", + "from langsfer.high_level import clp_transfer\n", + "from langsfer.embeddings import TransformersEmbeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dataset\n", + "\n", + "We use the [datasets](https://huggingface.co/docs/datasets/index) library to load the [oscar](https://huggingface.co/datasets/oscar-corpus/oscar), which stands for **O**pen **S**uper-large **C**rawled **A**LMAnaCH co**R**pus, dataset's german configuration and then take a limited number of samples from it for training and validation." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = datasets.load_dataset(\n", + " DATASET_NAME,\n", + " DATASET_CONFIG_NAME,\n", + " split=\"train\",\n", + " streaming=True,\n", + " trust_remote_code=True,\n", + ")\n", + "dataset = dataset.shuffle(seed=SEED)\n", + "dataset = dataset.take(DATASET_SIZE)\n", + "train_dataset = dataset.take(TRAIN_DATASET_SIZE)\n", + "val_dataset = dataset.skip(TRAIN_DATASET_SIZE)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We take sample text from the validation set in order to evaluate the generation of our trained model at the end. " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. 
Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n" + ] + } + ], + "source": [ + "sample_text = list(val_dataset.skip(10).take(1))[0][\"text\"]\n", + "print(sample_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings and Tokenizers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We load the source tokenizer as well as the source model and extract the input embeddings matrix from it." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "source_tokenizer = AutoTokenizer.from_pretrained(SOURCE_MODEL_NAME)\n", + "source_model = AutoModel.from_pretrained(SOURCE_MODEL_NAME)\n", + "source_embeddings_matrix = source_model.get_input_embeddings().weight.detach().numpy()\n", + "del source_model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then load the target tokenizer as well as the helper model's embeddings to use as auxiliary embeddings. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "target_tokenizer = AutoTokenizer.from_pretrained(HELPER_MODEL_NAME)\n", + "target_auxiliary_embeddings = TransformersEmbeddings.from_model_name_or_path(\n", + " HELPER_MODEL_NAME\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens 172, tokens: ['mit', 'ĠEva', 'ĠMatt', 'es', 'Ġal', 's', 'ĠKl', 'ara', 'ĠBl', 'um', '.', 'ĠE', 'ine', 'ĠB', 'ä', 'cker', 'st', 'och', 'ter', 'Ġstir', 'bt', 'Ġin', 'Ġder', 'ĠBack', 'r', 'ö', 'h', 're', '.', 'ĠJet', 'z', 't', 'Ġhat', 'Ġi', 'h', 're', 'ĠSchw', 'ester', 'Ġ(', 'Jul', 'ia', 'ĠJ', 'ents', 'ch', ')', 'ĠAng', 'st', 'âĢ¦', 'ĠDo', 'ppel', 'b', 'ö', 'dig', '.', 'Ċ', 'in', 'Ġder', 'Ġre', 'ch', 'ten', 'ĠArm', 'be', 'uge', 'Ġbe', 'im', 'ĠÃĸ', 'ff', 'nen', 'Ġdes', 'ĠMeh', 'ls', 'il', 'os', 'Ġz', 'ur', 'ĠR', 'ett', 'ung', 'Ġder', 'ĠB', 'ä', 'cker', 'st', 'oc', 'her', ',', 'Ġwel', 'ch', 'Ġe', 'in', 'ĠReg', 'ief', 'eh', 'ler', '!', 'ĠDer', 'Ġt', 'ief', 'er', 'ge', 'hend', 'e', 'ĠSinn', 'Ġdes', 'ĠFall', 'es', 'Ġw', 'ird', 'Ġan', 'son', 'sten', 'Ġa', 'uch', 'Ġn', 'icht', 'Ġk', 'lar', '.', 'ĠW', 'irk', 't', 'Ġle', 'ider', 'Ġall', 'es', 'Ġet', 'was', 'Ġz', 'us', 'amm', 'enges', 'ch', 'ust', 'ert', '.', 'Ċ', 'W', 'er', 'Ġsp', 'iel', 'te', 'Ġdie', 'ĠHau', 'pt', 'rol', 'le', 'Ġin', 'ĠFilm', 'Ġ\"', 'The', 'ĠInternational', '\"', 'Ġund', 'Ġw', 'urd', 'e', 'Ġal', 's', 'Ġpot', 'enzie', 'ller', 'ĠJames', 'ĠBond', '-', 'N', 'ach', 'fol', 'ger', 'Ġge', 'hand', 'elt', '?']\n" + ] + } + ], + "source": [ + "tokens = source_tokenizer.tokenize(sample_text)\n", + "print(f\"Number of tokens {len(tokens)}, tokens: {tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then use the target tokenizer to convert the sample text to tokens and notice that the conversion creates fewer tokens than previously." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of tokens 108, tokens: ['mit', 'ĠEva', 'ĠMatt', 'es', 'Ġals', 'ĠKl', 'ara', 'ĠBlum', '.', 'ĠEine', 'ĠBäcker', 'st', 'ochter', 'Ġstirbt', 'Ġin', 'Ġder', 'ĠBack', 'röhre', '.', 'ĠJetzt', 'Ġhat', 'Ġihre', 'ĠSchwester', 'Ġ(', 'Jul', 'ia', 'ĠJ', 'ent', 'sch', ')', 'ĠAngst', 'âĢ¦', 'ĠDoppel', 'bö', 'dig', '.', 'Ċ', 'in', 'Ġder', 'Ġrechten', 'ĠArmb', 'euge', 'Ġbeim', 'ĠÃĸffnen', 'Ġdes', 'ĠMehl', 'sil', 'os', 'Ġzur', 'ĠRettung', 'Ġder', 'ĠBäcker', 'st', 'ocher', ',', 'Ġwelch', 'Ġein', 'ĠReg', 'ief', 'ehler', '!', 'ĠDer', 'Ġtiefer', 'gehende', 'ĠSinn', 'Ġdes', 'ĠFall', 'es', 'Ġwird', 'Ġansonsten', 'Ġauch', 'Ġnicht', 'Ġklar', '.', 'ĠWir', 'kt', 'Ġleider', 'Ġalles', 'Ġetwas', 'Ġzusammen', 'gesch', 'uster', 't', '.', 'Ċ', 'Wer', 'Ġspielte', 'Ġdie', 'ĠHauptrolle', 'Ġin', 'ĠFilm', 'Ġ\"', 'The', 'ĠInternational', '\"', 'Ġund', 'Ġwurde', 'Ġals', 'Ġpoten', 'ziel', 'ler', 'ĠJames', 'ĠBond', '-', 'Nach', 'folger', 'Ġgehandelt', '?']\n" + ] + } + ], + "source": [ + "tokens = target_tokenizer.tokenize(sample_text)\n", + "print(f\"Number of tokens {len(tokens)}, tokens: {tokens}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finally, instantiate the embedding initializer for CLP-Transfer" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_initializer = clp_transfer(\n", + " source_tokenizer=source_tokenizer,\n", + " source_embeddings_matrix=source_embeddings_matrix,\n", + " target_tokenizer=target_tokenizer,\n", + " target_auxiliary_embeddings=target_auxiliary_embeddings,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then initialize the target embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cc6a005f21bb456faa0c53b27ebe5431", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Non-Overlapping Tokens: 0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "target_embeddings_matrix = embedding_initializer.initialize(seed=16, show_progress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have the initialized embeddings matrix, we can use it to replace the embeddings matrix in the source model. " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "tags": [ + "remove-output" + ] + }, + "outputs": [], + "source": [ + "target_model_wechsel = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)\n", + "\n", + "# Resize its embedding layer\n", + "target_model_wechsel.resize_token_embeddings(len(target_tokenizer))\n", + "\n", + "# Replace the source embeddings matrix with the target embeddings matrix\n", + "target_model_wechsel.get_input_embeddings().weight.data = torch.as_tensor(\n", + " target_embeddings_matrix\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> We used `AutoModelForCausalLM` instead of `AutoModel` because we will train the newly initialized model for causal language modelling." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dataset preprocessing\n", + "\n", + "Before training, we must preprocess the training and validation sets by tokenizing the text, removing all other columns and then converting the resulting arrays to PyTorch tensors." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train_dataset = train_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")\n", + "train_dataset = train_dataset.with_format(\"torch\")\n", + "\n", + "val_dataset = val_dataset.map(\n", + " lambda x: target_tokenizer(x[\"text\"], truncation=True),\n", + " batched=True,\n", + " remove_columns=dataset.column_names,\n", + ")\n", + "val_dataset = val_dataset.with_format(\"torch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We define the training parameters and instantiate a [Trainer](https://huggingface.co/docs/transformers/en/main_classes/trainer) object." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "max_steps is given, it will override any value given in num_train_epochs\n" + ] + } + ], + "source": [ + "data_collator = DataCollatorForLanguageModeling(tokenizer=target_tokenizer, mlm=False)\n", + "\n", + "training_args = TrainingArguments(\n", + " output_dir=\"/tmp/clp_transfer\",\n", + " eval_strategy=\"steps\",\n", + " report_to=\"tensorboard\",\n", + " eval_steps=EVAL_STEPS // GRADIENT_ACCUMULATION_STEPS,\n", + " max_steps=MAX_TRAIN_STEPS // GRADIENT_ACCUMULATION_STEPS,\n", + " per_device_train_batch_size=TRAIN_BATCH_SIZE,\n", + " gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,\n", + " learning_rate=LEARNING_RATE,\n", + " weight_decay=WEIGHT_DECAY,\n", + " adam_epsilon=ADAM_EPSILON,\n", + " adam_beta1=ADAM_BETA1,\n", + " adam_beta2=ADAM_BETA2,\n", + " bf16=True,\n", + ")\n", + "\n", + "if target_tokenizer.pad_token is None:\n", + " target_tokenizer.pad_token = target_tokenizer.eos_token\n", + "\n", + "trainer = Trainer(\n", + " model=target_model_wechsel,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=val_dataset,\n", + " data_collator=data_collator,\n", + " tokenizer=target_tokenizer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We evaluate the model before training" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation loss before training: 10.631\n" + ] + } + ], + "source": [ + "eval_loss = trainer.evaluate()[\"eval_loss\"]\n", + "print(f\"Evaluation loss before training: {eval_loss:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We then train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [750/750 9:37:24, Epoch 11/9223372036854775807]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation Loss
62No log6.976626
124No log6.584036
186No log6.243427
248No log5.986516
310No log5.811096
372No log5.672745
434No log5.567986
496No log5.485695
5586.2780005.421839
6206.2780005.378000
6826.2780005.350171
7446.2780005.338133

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "TrainOutput(global_step=750, training_loss=6.01977197265625, metrics={'train_runtime': 34687.4607, 'train_samples_per_second': 5.535, 'train_steps_per_second': 0.022, 'total_flos': 5.861596369744896e+17, 'train_loss': 6.01977197265625, 'epoch': 11.083333333333334})" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We finally evaluate the model after the training" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation loss after training: 5.338\n" + ] + } + ], + "source": [ + "eval_loss = trainer.evaluate()[\"eval_loss\"]\n", + "print(f\"Evaluation loss after training: {eval_loss:.3f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As an additional evaluation, we take the sample text, truncate it and then make the trained model generate a completion for it " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", + "Setting `pad_token_id` to `eos_token_id`:None for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Original Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester (Julia Jentsch) Angst… Doppelbödig.\n", + "in der rechten Armbeuge beim Öffnen des Mehlsilos zur Rettung der Bäckerstocher, welch ein Regiefehler! Der tiefergehende Sinn des Falles wird ansonsten auch nicht klar. Wirkt leider alles etwas zusammengeschustert.\n", + "Wer spielte die Hauptrolle in Film \"The International\" und wurde als potenzieller James Bond-Nachfolger gehandelt?\n", + "---\n", + "Shortened Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester\n", + "---\n", + "Generated Text:\n", + "mit Eva Mattes als Klara Blum. Eine Bäckerstochter stirbt in der Backröhre. Jetzt hat ihre Schwester die Welt, das sie mit dem Mann und den anderen Menschen zu tun ist:\n", + "Die Frau wird von einem kleinen Kind aus einer großen Stadt auf ihrem Weg im Wald gebracht worden – auch wenn es sich um eine große Geschichte gibt! Die Mutter wurde am Ende des Jahres nach Berlin-Wittenberg (Bavaria) an diesem Tag wieder vor Ort sein; er war ein sehr gut bewebtter Leben für seine Familie… Aber ich habe mich nicht mehr so viel machen 🙂 Und da kann man ja schon mal noch einen paar Tage Zeit haben 😉 . 
Ich bin mir aber immer nur einmal etwas richtig gemacht wie ihr meine Mama oder Papa sind😂🤷️ #sunnylife 🌸#tweeting @kim_matthesbaby A post by kimi matzhaynesblog ⚽⛳❗‍♀☺ pic.twitter.com/w7qxrY6X5I — Kim MATTES 👩👨✈ ✅ ☆ ㄹˋ〜(をローで、結対の術架院京伝。 )[1] https://www.youtube.com/watch?v=8z4RpJy2fjE&\n" + ] + } + ], + "source": [ + "sample_input_ids = target_tokenizer(sample_text)[\"input_ids\"]\n", + "shortened_input_ids = sample_input_ids[: len(sample_input_ids) // 3 - 13]\n", + "shortened_text = target_tokenizer.decode(shortened_input_ids, add_special_tokens=False)\n", + "\n", + "generated_token_ids = (\n", + " trainer.model.generate(\n", + " torch.as_tensor(shortened_input_ids).reshape(1, -1).to(trainer.model.device),\n", + " max_length=300,\n", + " min_length=10,\n", + " top_p=0.9,\n", + " temperature=0.9,\n", + " repetition_penalty=2.0,\n", + " )\n", + " .detach()\n", + " .cpu()\n", + " .numpy()\n", + " .reshape(-1)\n", + ")\n", + "generated_tokens = target_tokenizer.decode(\n", + " generated_token_ids, add_special_tokens=False\n", + ")\n", + "print(\"Original Text:\")\n", + "print(sample_text)\n", + "print(\"---\")\n", + "print(\"Shortened Text:\")\n", + "print(shortened_text)\n", + "print(\"---\")\n", + "print(\"Generated Text:\")\n", + "print(generated_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The generated text's quality is not bad but the model needs further training on more data.\n", + "This was just done for the sake of the tutorial and is not meant to be a full-blown model training." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary\n", + "\n", + "In this tutorial, we have seen how to use CLP-Transfer in order to transfer a pre-trained language model to a new language." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langsfer-I0oHYpHZ-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 8e110b2a9004bc073420e3e4c38edf0a20d0394a Mon Sep 17 00:00:00 2001 From: AnesBenmerzoug Date: Fri, 15 Nov 2024 08:00:53 +0100 Subject: [PATCH 4/4] Update readme --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 15b05de..170327c 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Language transfer refers to a few related things: from another LLM trained in high-resource source language (e.g. English), - extending the vocabulary of an LLM by adding new tokens and initializing their embeddings in a manner that allows them to be used with little to no extra training, -- specializing the vocabulary of a multilingual LLM to one of its supported languages. +- specializing the vocabulary of a multilingual LLM to one of its supported languages. The library currently implements the following methods: @@ -59,6 +59,7 @@ pip install . 
 The following notebooks serve as tutorials for users of the package:
 
 - [WECHSEL Tutorial](notebooks/WECHSEL_tutorial.ipynb)
+- [CLP-Transfer Tutorial](notebooks/CLPT_tutorial.ipynb)
 
 ### Example
 
@@ -127,11 +128,11 @@ Refer to the [contributing guide](CONTRIBUTING.md) for instructions on you can m
 
 ## Logo
 
-The langsfer logo was created by my good friend [Zakaria Taleb Hacine](https://behance.net/zakariahacine), a 3D artist with
+The langsfer logo was created by my good friend [Zakaria Taleb Hacine](https://behance.net/zakariahacine), a 3D artist with
 industry experience and a packed portfolio.
 
 The logo contains the latin alphabet letters A and I which are an acronym for Artificial Intelligence and the arabic alphabet letters
-أ and ذ which are an acronym for الذكاء الاصطناعي, which is Artificial Intelligence in arabic.
+أ and ذ which are an acronym for ذكاء اصطناعي, which is Artificial Intelligence in arabic.
 
 The fonts used are [Ethnocentric Regular](https://www.myfonts.com/products/ethnocentric-ethnocentric-970121) and [Readex Pro](https://fonts.google.com/specimen/Readex+Pro).
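
As a quick reference, the CLP-Transfer workflow exercised in the tutorial notebook condenses to the sketch below. It only restates the notebook's cells in one place, using the tutorial's model names (`openai-community/gpt2-large` as the source model, `benjamin/gpt2-wechsel-german` as the helper model); treat it as a sketch rather than the library's canonical example.

```python
import torch
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from langsfer.embeddings import TransformersEmbeddings
from langsfer.high_level import clp_transfer

SOURCE_MODEL_NAME = "openai-community/gpt2-large"
HELPER_MODEL_NAME = "benjamin/gpt2-wechsel-german"

# Source tokenizer and the source model's input embeddings matrix
source_tokenizer = AutoTokenizer.from_pretrained(SOURCE_MODEL_NAME)
source_model = AutoModel.from_pretrained(SOURCE_MODEL_NAME)
source_embeddings_matrix = source_model.get_input_embeddings().weight.detach().numpy()

# Target tokenizer and the helper model's embeddings, used as auxiliary embeddings
target_tokenizer = AutoTokenizer.from_pretrained(HELPER_MODEL_NAME)
target_auxiliary_embeddings = TransformersEmbeddings.from_model_name_or_path(
    HELPER_MODEL_NAME
)

# Build the CLP-Transfer initializer and compute the target embeddings matrix
embedding_initializer = clp_transfer(
    source_tokenizer=source_tokenizer,
    source_embeddings_matrix=source_embeddings_matrix,
    target_tokenizer=target_tokenizer,
    target_auxiliary_embeddings=target_auxiliary_embeddings,
)
target_embeddings_matrix = embedding_initializer.initialize(seed=16, show_progress=True)

# Plug the initialized matrix into a fresh copy of the source model
target_model = AutoModelForCausalLM.from_pretrained(SOURCE_MODEL_NAME)
target_model.resize_token_embeddings(len(target_tokenizer))
target_model.get_input_embeddings().weight.data = torch.as_tensor(target_embeddings_matrix)
```

From here the tutorial notebook tokenizes an OSCAR subset with the target tokenizer and fine-tunes the re-initialized model with a standard causal language modelling `Trainer` run.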