diff --git a/.github/workflows/3d_parallelism_unit_tests.yaml b/.github/workflows/3d_parallelism_unit_tests.yaml index 73804d6c..28b70dd9 100644 --- a/.github/workflows/3d_parallelism_unit_tests.yaml +++ b/.github/workflows/3d_parallelism_unit_tests.yaml @@ -45,6 +45,7 @@ jobs: pip install -e . pip install -e .[dev] pip install -e .[test] + pip install -e .[nanosets] - name: Show installed libraries and their versions run: pip freeze | tee installed.txt diff --git a/.github/workflows/fa2_unit_tests.yaml b/.github/workflows/fa2_unit_tests.yaml index cc8e58ee..342be45e 100644 --- a/.github/workflows/fa2_unit_tests.yaml +++ b/.github/workflows/fa2_unit_tests.yaml @@ -39,7 +39,7 @@ jobs: python -c "import torch; print('torch:', torch.__version__, torch)" python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - name: Instal nanotron + - name: Install nanotron run: | python -m pip install --upgrade pip pip install packaging @@ -55,4 +55,4 @@ jobs: - name: Run tests # NOTE: -m fa2 will only run the unit tests that have the mark # "fa2" (these are FA2-related tests) - run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --verbose tests/ + run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --ignore tests/nanoset --verbose tests/ diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml new file mode 100644 index 00000000..9cbbf680 --- /dev/null +++ b/.github/workflows/trufflehog.yml @@ -0,0 +1,15 @@ +on: + push: + +name: Secret Leaks + +jobs: + trufflehog: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Secret Scanning + uses: trufflesecurity/trufflehog@main diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 614ac177..5174d157 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,4 +33,4 @@ repos: - id: codespell args: - -w - - --ignore-words-list=nd,reacher,thist,ths,magent,ba,fo + - --ignore-words-list=nd,reacher,thist,ths,magent,ba,fo,doesnt diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..ce886214 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +feedback@huggingface.co. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. 
+ +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..997ad347 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,224 @@ + + +# How to contribute to 🤗 Nanotron? + +Everyone is welcome to contribute, and we value everybody's contribution. Code +is thus not the only way to help the community. Answering questions, helping +others, reaching out and improving the documentations are immensely valuable to +the community. + +It also helps us if you spread the word: reference the library from blog posts +on the awesome projects it made possible, shout out on Twitter every time it has +helped you, or simply star the repo to say "thank you". + +Whichever way you choose to contribute, please be mindful to respect our +[code of conduct](CODE_OF_CONDUCT.md). + +## You can contribute in so many ways! + +Some of the ways you can contribute to nanotron: +* Fixing outstanding issues with the existing code; +* Contributing to the examples or to the documentation; +* Submitting issues related to bugs or desired new features. + +## Submitting a new issue or feature request + +Do your best to follow these guidelines when submitting an issue or a feature +request. It will make it easier for us to come back to you quickly and with good +feedback. + +### Did you find a bug? + +The 🤗 Nanotron library is robust and reliable thanks to the users who notify us of +the problems they encounter. So thank you for reporting an issue. + +First, we would really appreciate it if you could **make sure the bug was not +already reported** (use the search bar on Github under Issues). + +Did not find it? :( So we can act quickly on it, please follow these steps: + +* Include your **OS type and version**, the versions of **Python** and **PyTorch**. +* A short, self-contained, code snippet that allows us to reproduce the bug in + less than 30s; +* Provide your Nanotron configuration used for the run; +* Describe the expected behavior and the actual behavior; + +### Do you want a new feature? + +A good feature request addresses the following points: + +1. Motivation first: +* Is it related to a problem/frustration with the library? If so, please explain + why. Providing a code snippet that demonstrates the problem is best. 
+* Is it related to something you would need for a project? We'd love to hear + about it! +* Is it something you worked on and think could benefit the community? + Awesome! Tell us what problem it solved for you. +2. Write a *full paragraph* describing the feature; +3. Provide a **code snippet** that demonstrates its future use; +4. In case this is related to a paper, please attach a link; +5. Attach any additional information (drawings, screenshots, etc.) you think may help. + +If your issue is well written we're already 80% of the way there by the time you +post it. + +## Submitting a pull request (PR) + +Before writing code, we strongly advise you to search through the existing PRs or +issues to make sure that nobody is already working on the same thing. If you are +unsure, it is always a good idea to open an issue to get some feedback. + +You will need basic `git` proficiency to be able to contribute to +🤗 Nanotron. `git` is not the easiest tool to use but it has the greatest +manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro +Git](https://git-scm.com/book/en/v2) is a very good reference. + +Follow these steps to start contributing: + +1. Fork the [repository](https://github.com/huggingface/nanotron) by + clicking on the 'Fork' button on the repository's page. This creates a copy of the code + under your GitHub user account. + +2. Clone your fork to your local disk, and add the base repository as a remote. The following command + assumes you have your public SSH key uploaded to GitHub. See the following guide for more + [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository). + + ```bash + $ git clone git@github.com:/nanotron.git + $ cd nanotron + $ git remote add upstream https://github.com/huggingface/nanotron.git + ``` + +3. Create a new branch to hold your development changes, and do this for every new PR you work on. + + Start by synchronizing your `main` branch with the `upstream/main` branch (ore details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)): + + ```bash + $ git checkout main + $ git fetch upstream + $ git merge upstream/main + ``` + + Once your `main` branch is synchronized, create a new branch from it: + + ```bash + $ git checkout -b a-descriptive-name-for-my-changes + ``` + + **Do not** work on the `main` branch. + +4. Set up a development environment by running the following command in a conda or a virtual environment you've created for working on this library: + + ```bash + $ pip install -e ".[dev]" + $ pip install -e ".[test]" + $ pre-commit install + ``` + + (If nanotron was already installed in the virtual environment, remove + it with `pip uninstall nanotron` before reinstalling it in editable + mode with the `-e` flag.) + + Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using + the provided Dev Container. Documentation on how to get started with dev containers is available [here](https://code.visualstudio.com/docs/remote/containers). + +5. Develop the features on your branch. + + As you work on the features, you should make sure that the test suite + passes. You should run the tests impacted by your changes like this (see + below an explanation regarding the environment variable): + + ```bash + $ pytest tests/.py + ``` + + `nanotron` relies on `ruff` to format its source code + consistently. 
After you make changes, apply automatic style corrections and code verifications + that can't be automated in one go with: + + This target is also optimized to only work with files modified by the PR you're working on. + + If you prefer to run the checks one after the other, the following command apply the + style corrections: + + ```bash + $ pre-commit run --all-files + ``` + + Once you're happy with your changes, add changed files using `git add` and + make a commit with `git commit` to record your changes locally: + + ```bash + $ git add modified_file.py + $ git commit + ``` + + Please write [good commit messages](https://chris.beams.io/posts/git-commit/). + + It is a good idea to sync your copy of the code with the original + repository regularly. This way you can quickly account for changes: + + ```bash + $ git fetch upstream + $ git rebase upstream/main + ``` + + Push the changes to your account using: + + ```bash + $ git push -u origin a-descriptive-name-for-my-changes + ``` + +6. Once you are satisfied (**and the checklist below is happy too**), go to the + webpage of your fork on GitHub. Click on 'Pull request' to send your changes + to the project maintainers for review. + +7. It's ok if maintainers ask you for changes. It happens to core contributors + too! So everyone can see the changes in the Pull request, work in your local + branch and push the changes to your fork. They will automatically appear in + the pull request. + + +### Checklist + +1. The title of your pull request should be a summary of its contribution; +2. If your pull request addresses an issue, please mention the issue number in + the pull request description to make sure they are linked (and people + consulting the issue know you are working on it); +3. To indicate a work in progress please prefix the title with `[WIP]`, or mark + the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate + it from PRs ready to be merged; +4. Make sure existing tests pass; +5. Add high-coverage tests. No quality testing = no merge. + +See an example of a good PR here: https://github.com/huggingface/nanotron/pull/155 + +### Tests + +An extensive test suite is included to test the library behavior and several examples. Library tests can be found in +the [tests folder](https://github.com/huggingface/nanotron/tree/main/tests). + +We use `pytest` in order to run the tests. From the root of the +repository, here's how to run tests with `pytest` for the library: + +```bash +# Runs all tests (where 12 of which run in parallel) +$ pytest -n 12 tests +``` + +You can specify a smaller set of tests in order to test only the feature +you're working on. diff --git a/Makefile b/Makefile index b9e18168..0ab20da6 100644 --- a/Makefile +++ b/Makefile @@ -14,3 +14,9 @@ test: --ignore tests/fp8 \ --verbose \ examples/doremi/tests/ + + pip install -r examples/llama/requirements.txt + pytest \ + --color=yes \ + --verbose \ + examples/llama/tests/ diff --git a/README.md b/README.md index b5748d60..7a22f12a 100644 --- a/README.md +++ b/README.md @@ -11,111 +11,93 @@

- Philosophy •
- Core Features •
- Installation •
- Usage •
- Contributions •
- Debugging
+ Installation •
+ Quick Start •
+ Features •
+ Contributing

+

+

Pretraining models made easy +

+Nanotron is a library for pretraining transformer models. It provides a simple and flexible API to pretrain models on custom datasets. Nanotron is designed to be easy to use, fast, and scalable. It is built with the following principles in mind: -# - -The objective of this library is to provide easy distributed primitives in order to train a variety of models efficiently using 3D parallelism. For more information about the internal design of the library or 3D parallelism in general, please check out [[docs.md]](./docs/docs.md) and [[3d_parallelism.md]](./docs/3d_parallelism.md). - - -# Philosophy - -- Make it fast. At least as fast as other open source versions. -- Make it minimal. We don't actually need to support all techniques and all versions of 3D parallelism. What matters is that we can efficiently use the "best" ones. -- Make everything explicit instead of transparent. As we move forward, making things transparent works well when it works well but is a horrible debugging experience if one doesn't understand the implications of techniques used. In order to mitigate this, we choose to be explicit in the way it does things - -# Core Features - -We support the following: - - 3D parallelism, including one-forward-one-backward pipeline engine - - ZeRO-1 optimizer - - FP32 gradient accumulation - - Parameter tying/sharding - - Spectral µTransfer parametrization for scaling up neural networks - -# Installation +- **Simplicity**: Nanotron is designed to be easy to use. It provides a simple and flexible API to pretrain models on custom datasets. +- **Performance**: Optimized for speed and scalability, Nanotron uses the latest techniques to train models faster and more efficiently. -Requirements: - - Python >= 3.10 - - PyTorch >= 2.0.0 - - Flash-Attention >= 2.5.0 +## Installation -To install (in a new env): ```bash -pip install torch -pip install packaging; pip install "flash-attn>=2.5.0" --no-build-isolation -pip install nanotron +# Requirements: Python>=3.10 +git clone https://github.com/huggingface/nanotron +cd nanotron +pip install --upgrade pip +pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 +pip install -e . + +# Install dependencies if you want to use the example scripts +pip install datasets transformers +pip install triton "flash-attn>=2.5.0" --no-build-isolation ``` +> [!NOTE] +> If you get `undefined symbol: ncclCommRegister` error you should install torch 2.1.2 instead: `pip install torch==2.1.2 --index-url https://download.pytorch.org/whl/cu121` -Also nice to have: `pip install transformers datasets python-etcd tensorboardX` - -We also support a set of flavors that you can install using `pip install -e [$FLAVOR]`: - - `dev`: Used is you are developping in `nanotron`. It installs in particular our linter mechanism. On top of that you have to run `pre-commit install` afterwards. - - `test`: We use `pytest` in order to run out testing suite. In order to run tests in parallel, it will install `pytest-xdist`, which you can leverage by running `pytest -n 12 tests` (12 is the number of parallel test) - - -# Quick examples - -In the `/examples` directory, you can find a few example configuration file, and a script to run it. - -You can run a sample training using: -```bash -torchrun --nproc_per_node=8 run_train.py --config-file examples/train_tiny_llama.sh -``` +> [!TIP] +> We log to wandb automatically if it's installed. For that you can use `pip install wandb`. If you don't want to use wandb, you can run `wandb disabled`. 
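+
+Optionally, you can sanity-check the environment before launching a run. This is a minimal sketch and not part of the official setup; `flash_attn` is only needed for the example scripts:
+
+```python
+import importlib.util
+
+import nanotron
+import torch
+
+print("nanotron imported from:", nanotron.__file__)
+print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
+print("flash_attn installed:", importlib.util.find_spec("flash_attn") is not None)
+```
+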
-And run a sample generation using: +## Quick Start +### Training a tiny Llama model +The following command will train a tiny Llama model on a single node with 8 GPUs. The model will be saved in the `checkpoints` directory as specified in the config file. ```bash -torchrun --nproc_per_node=8 run_generation.py --ckpt-path checkpoints/text/4 +CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file examples/config_tiny_llama.yaml ``` -# Development guidelines - -If you plan on developing on `nanotron`, we suggest you install the `dev` flavor: `pip install -e ".[dev]"` - -We use pre-commit to run a bunch of callbacks on each commit, mostly normalization code in order for the codebase to stay consistent. Please do run `pre-commit install`. - -For the linting: +### Run generation from your checkpoint ```bash -pre-commit install -pre-commit run --config .pre-commit-config.yaml --all-files +torchrun --nproc_per_node=1 run_generate.py --ckpt-path checkpoints/10/ --tp 1 --pp 1 +# We could set a larger TP for faster generation, and a larger PP in case of very large models. ``` -*As a part of making sure we aren't slowed down as the codebase grows, we will not merge a PR if the features it introduces do not have test coverage.* - -We have extensions built on top of Nanotron, with their tests located in the `/examples` folder. Since VSCode defaults to discovering tests only in the `/tests` folder, please run tests from both `/examples` and `/tests` to ensure your PR does not break these extensions. Please run `make tests` to execute all the nanotron tests and the tests in the `/examples` directory that you need to pass. - -Features we would like to add: -- [ ] Support `torch.compile` -- [ ] More optimized kernels -- [ ] Support Zero3 -- [ ] Other PP schedules (such as Interleaved 1f1b...) -- [ ] Ring attention / Sequence Parallelism -- [ ] 3D Parallel MoEs -- [ ] Supporting more architectures (Mamba..) -- [ ] ... - - -# Useful scripts -- `scripts/log_lighteval_to_wandb.py`: logs the evaluation results of LightEval to wandb, including summary statistics. - - -# Environment Variables -- `NANOTRON_BENCHMARK=1`: if you want to log the throughput during training - - -# Credits - -We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration: Nvidia for `Megatron-LM/apex`, Microsoft for `DeepSpeed`, HazyResearch for `flash-attn` +### Custom examples +You can find more examples in the [`/examples`](/examples) directory: + +| Example | Description | +| --- | --- | +| `custom-dataloader` | Plug a custom dataloader to nanotron | +| `datatrove` | Use the datatrove library to load data | +| `doremi` | Use DoReMi to speed up training | +| `mamba` | Train an example Mamba model | +| `moe` | Train an example Mixture-of-Experts (MoE) model | +| `mup` | Use spectral µTransfer to scale up your model | + +We're working on adding more examples soon! Feel free to add a PR to add your own example. 
🚀 + + +## Features +We currently support the following features: +- [x] 3D parallelism (DP+TP+PP) +- [x] Expert parallelism for MoEs +- [x] AFAB and 1F1B schedules for PP +- [x] Explicit APIs for TP and PP which enables easy debugging +- [x] ZeRO-1 optimizer +- [x] FP32 gradient accumulation +- [x] Parameter tying/sharding +- [x] Custom module checkpointing for large models +- [x] Spectral µTransfer parametrization for scaling up neural networks +- [x] Mamba example + +And we have on our roadmap: +- [ ] FP8 training +- [ ] ZeRO-3 optimizer (a.k.a FSDP) +- [ ] `torch.compile` support +- [ ] Ring attention +- [ ] Interleaved 1f1b schedule + +## Credits +We would like to thank everyone working on LLMs, especially those sharing their work openly from which we took great inspiration: Nvidia for `Megatron-LM/apex`, Microsoft for `DeepSpeed`, HazyResearch for `flash-attn`.. diff --git a/docs/nanoset.md b/docs/nanoset.md new file mode 100644 index 00000000..02649bd0 --- /dev/null +++ b/docs/nanoset.md @@ -0,0 +1,131 @@ +# Nanosets +Nanotron incorporates [`Nanosets`](../src/nanotron/data/nanoset.py), a kind of datasets based on [numpy memory-mapped arrays](https://numpy.org/doc/stable/reference/generated/numpy.memmap.html). `Nanosets` are capable of serving batches from files containing pre-tokenized datasets. They allow reading tokens from one or multiple datasets and even specifying the weight of each dataset when building batches. +## Install +To use `Nanosets`, it's necessary to install Nanotron with the `nanosets` flavor. +``` +pip install -e '.[nanosets]' +``` +This will install the following dependencies: +- `transformers`: To tokenize the datasets +- `datasets`: To preprocess the datasets +- `numba`: To compile helper functions in order to speed up the creation of `Nanosets` +## Data pre-processing +To use these datasets, first, we need to preprocess the data. The input format can either be a column of a Hugging Face Dataset or a .json file containing a text sample per line. For example: + +
+{"src": "www.nvidia.com", "text": "The quick brown fox", "type": "Eng", "id": "0", "title": "First Part"}
+{"src": "The Internet", "text": "jumps over the lazy dog", "type": "Eng", "id": "42", "title": "Second Part"}
+
+
+The preprocessing is done using the [`tools/preprocess_data.py`](../tools/preprocess_data.py) script. Below we show an example for processing a corpus with the GPT-2 tokenizer.
+
+torchrun --nproc-per-node 16 tools/preprocess_data.py \
+       --input HuggingFaceH4/testing_alpaca_small \
+       --split train \
+       --column completion \
+       --output-prefix datasets/testing_alpaca_small \
+       --tokenizer-name-or-path openai-community/gpt2
+
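+
+Once preprocessing finishes, you can sanity-check the tokenized output before training. This is a minimal sketch (not part of the official workflow) that assumes the output path and tokenizer from the example above and that `numpy` and `transformers` are installed:
+
+```python
+import numpy as np
+from transformers import AutoTokenizer
+
+# The .npy header stores shape and dtype, so memory-mapping is enough to inspect
+# the file without loading the whole token stream into RAM.
+tokens = np.load("datasets/testing_alpaca_small_input_ids.npy", mmap_mode="r")
+print(tokens.shape, tokens.dtype)
+
+# Decode a small slice with the same tokenizer used for preprocessing.
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+print(tokenizer.decode(tokens.reshape(-1)[:32].tolist()))
+```
+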
+ +The preprocessing script has to be launched with `torchrun` in order to spawn `--nproc-per-node` workers that will preprocess the dataset concurrently. The `--input` dataset can be either a Hugging Face Dataset from the Hub or a `.json` file. The processed dataset will be stored in *`--output-prefix`_input_ids.npy*. In `--tokenizer-name-or-path`, we will have to specify a tokenizer in the same way as we do when using `AutoTokenizers.from_pretrained(...)`. + +The output will be one file named, in this case, `datasets/testing_alpaca_small_input_ids.npy`. We will then have to specify this file in the `dataset_path` field in the config file. + +## Working with Nanosets + +To work with `Nanosets`, we just need to configure 1 argument: +1. `dataset_path`: This argument specifies the file or files that will compose the `Nanoset`. There are 3 ways to specify it: + 1. If we specify a single path, we will create a `Nanoset` from a single dataset file. + ```yaml + data_stages: + - name: General purpose training (Single dataset) + start_training_step: 1 + data: + dataset: + dataset_path: datasets/SlimPajama-6B_input_ids.npy + num_loading_workers: 0 + seed: 1234 + ``` + 2. If we specify a list of paths, we will create a `Nanoset` from all the dataset files. In every epoch we will consume each and every sample from each dataset randomly. + ```yaml + data_stages: + - name: Second purpose training (> 1 dataset) + start_training_step: 15 + data: + dataset: + dataset_path: + - datasets/SlimPajama-6B_input_ids.npy + - datasets/testing_alpaca_small_input_ids.npy + num_loading_workers: 0 + seed: 1234 + ``` + 3. If we specify a dictionary with paths and weights, we will create a `Nanoset` from the dataset files where each epoch will have a number of samples from each dataset according to the specified weights. + ```yaml + data_stages: + - name: Third purpose training (Blended dataset) + start_training_step: 25 + data: + dataset: + dataset_path: + datasets/SlimPajama-6B_input_ids.npy: 0.8 + datasets/testing_alpaca_small_input_ids.npy: 0.2 + num_loading_workers: 0 + seed: 1234 + ``` +> [!IMPORTANT] +> Remember to set the `tokenizer.tokenizer_name_or_path` in the config file to the tokenizer used to preprocess the documents and set the `model.model_config.vocab_size` accordingly. + +Finally, to use the `Nanosets`, launch the training with [`run_train.py`](../run_train.py). +```shell +torchrun --nproc-per-node 8 run_train.py --config configs/config_nanoset.yaml +``` + +## Under the hood +`Nanosets` are responsible of building samples of `sequence length + 1` tokens from the preprocessed dataset files. The `dataset lengths` of each dataset will be determined by the `(dataset_number_of_tokens - 1) / sequence length`, discarding the last sample if its length < `sequence length`. + +Based on the `dataset lengths`, the `dataset weights` and the `number of samples per epoch` (defined as the `sum(dataset lengths)`), we build the two indexes we need in order to extract samples from the `Nanoset` ([build_nanoset_index_helper](../src/nanotron/data/nanoset.py)): +- `dataset index`: Contains the index of the dataset from the list of `dataset paths` from which to extract the sample, respecting the established dataset weight. 
+``` +Given: + +D = [d0, d1, d2, d3] # datasets +DL = [8, 2, 5, 5] # dataset lengths +W = [0.1, 0.5, 0.3, 0.1] # dataset weights +SPE = 20 # number of samples per epoch + +Then, for example: + +dataset_index = [1, 2, 0, 1, 3, 1, 2, 1, 2, 1, 0, 1, 2, 1, 3, 1, 2, 1, 2, 1] +``` +- `dataset sample index`: Contains the sample index to extract from the `dataset index[index]` dataset, always < `len(dataset)`. +``` +dataset_index = [1, 2, 0, 1, 3, 1, 2, 1, 2, 1, 0, 1, 2, 1, 3, 1, 2, 1, 2, 1] +dataset_sample_index = [0, 0, 0, 1, 0, 0, 1, 1, 2, 0, 1, 1, 3, 0, 1, 1, 4, 0, 0, 1] +``` +Then, we **shuffle with the same permutation both indexes** and concatenate them `number of epochs` times, which is defined by `train split num samples` / `number of samples per epoch`. +``` +Given: + +N = 70 # train split num samples + +dataset_index = [1, 2, 0, 1, 3, 1, 2, 1, 2, 1, 0, 1, 2, 1, 3, 1, 2, 1, 2, 1] +dataset_sample_index = [0, 0, 0, 1, 0, 0, 1, 1, 2, 0, 1, 1, 3, 0, 1, 1, 4, 0, 0, 1] + +Shuffle dataset_index and dataset_sample_index: + +dataset_index = [1, 1, 0, 2, 3, 1, 3, 1, 2, 2, 1, 1, 0, 1, 1, 2, 1, 2, 2, 1] +dataset_sample_index = [1, 0, 0, 4, 1, 0, 0, 0, 2, 0, 0, 1, 1, 0, 1, 0, 1, 3, 1, 1] + +n_concatenations = (70/(20)) + 1 = 4 +dataset_index = dataset_index concatenated 4 times +dataset_sample_index = dataset_sample_index concatenated 4 times + +dataset_index = dataset_index[: N] +dataset_sample_index = dataset_sample_index[: N] +``` +To query the `Nanoset` for the k-th sample we do the following: +- Use the `dataset_index` to retrieve the corresponding dataset from `D` and the `dataset_sample_index` to retrieve the corresponding sample from that dataset. +``` +sample = D[dataset_index[k]][dataset_sample_index[k]] +``` diff --git a/examples/config_nanoset.yaml b/examples/config_nanoset.yaml new file mode 100644 index 00000000..31f23bf0 --- /dev/null +++ b/examples/config_nanoset.yaml @@ -0,0 +1,110 @@ +checkpoints: + checkpoint_interval: 1000 + checkpoints_path: checkpoints/ + checkpoints_path_is_shared_file_system: false + resume_checkpoint_path: null + save_initial_state: false +data_stages: +- data: + dataset: + dataset_path: datasets/testing_alpaca_small_input_ids.npy + num_loading_workers: 1 + seed: 42 + name: General purpose training (Single dataset) + start_training_step: 1 +- data: + dataset: + dataset_path: + - datasets/yelp_review_full_input_ids.npy + - datasets/testing_alpaca_small_input_ids.npy + num_loading_workers: 1 + seed: 42 + name: Second purpose training (> 1 dataset) + start_training_step: 15 +- data: + dataset: + dataset_path: + datasets/testing_alpaca_small_input_ids.npy: 0.8 + datasets/yelp_review_full_input_ids.npy: 0.2 + num_loading_workers: 1 + seed: 42 + name: Third purpose training (Blended dataset) + start_training_step: 25 +general: + benchmark_csv_path: null + consumed_train_samples: null + ignore_sanity_checks: true + project: Nanoset + run: llama + seed: 42 + step: null +lighteval: null +logging: + iteration_step_info_interval: 1 + log_level: info + log_level_replica: info +model: + ddp_bucket_cap_mb: 25 + dtype: bfloat16 + init_method: + std: 0.025 + make_vocab_size_divisible_by: 1 + model_config: + bos_token_id: 1 + eos_token_id: 2 + hidden_act: silu + hidden_size: 16 + initializer_range: 0.02 + intermediate_size: 64 + is_llama_config: true + max_position_embeddings: 256 + num_attention_heads: 4 + num_hidden_layers: 2 + num_key_value_heads: 4 + pad_token_id: null + pretraining_tp: 1 + rms_norm_eps: 1.0e-05 + rope_scaling: null + tie_word_embeddings: true + 
use_cache: true + vocab_size: 32000 +optimizer: + accumulate_grad_in_fp32: true + clip_grad: 1.0 + learning_rate_scheduler: + learning_rate: 0.0003 + lr_decay_starting_step: null + lr_decay_steps: 98 + lr_decay_style: cosine + lr_warmup_steps: 2 + lr_warmup_style: linear + min_decay_lr: 1.0e-05 + optimizer_factory: + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-08 + name: adamW + torch_adam_is_fused: true + weight_decay: 0.01 + zero_stage: 0 +parallelism: + dp: 2 + expert_parallel_size: 1 + pp: 1 + pp_engine: 1f1b + tp: 2 + tp_linear_async_communication: true + tp_mode: REDUCE_SCATTER +profiler: null +tokenizer: + tokenizer_max_length: null + tokenizer_name_or_path: gpt2 + tokenizer_revision: null +tokens: + batch_accumulation_per_replica: 1 + limit_test_batches: 0 + limit_val_batches: 0 + micro_batch_size: 2 + sequence_length: 128 + train_steps: 200 + val_check_interval: -1 diff --git a/examples/config_tiny_llama.py b/examples/config_tiny_llama.py index 31431956..479e1d47 100644 --- a/examples/config_tiny_llama.py +++ b/examples/config_tiny_llama.py @@ -2,6 +2,7 @@ import os from nanotron.config import ( + AdamWOptimizerArgs, CheckpointsArgs, Config, DataArgs, @@ -62,11 +63,13 @@ weight_decay=0.01, clip_grad=1.0, accumulate_grad_in_fp32=True, - adam_eps=1e-08, - adam_beta1=0.9, - adam_beta2=0.95, - torch_adam_is_fused=True, learning_rate_scheduler=learning_rate, + optimizer_factory=AdamWOptimizerArgs( + adam_eps=1e-08, + adam_beta1=0.9, + adam_beta2=0.95, + torch_adam_is_fused=True, + ), ) parallelism = ParallelismArgs( @@ -78,13 +81,28 @@ tp_linear_async_communication=True, ) -tokens = TokensArgs(sequence_length=32, train_steps=10, micro_batch_size=2, batch_accumulation_per_replica=1) +tokens = TokensArgs(sequence_length=256, train_steps=15, micro_batch_size=2, batch_accumulation_per_replica=1) -dataset = PretrainDatasetsArgs( - hf_dataset_or_datasets="HuggingFaceH4/testing_alpaca_small", text_column_name="completion" -) +data_stages = [ + DatasetStageArgs( + name="Stable Training Stage", + start_training_step=1, + data=DataArgs( + dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="stas/openwebtext-10k", text_column_name="text"), + seed=seed, + ), + ), + DatasetStageArgs( + name="Annealing Phase", + start_training_step=10, + data=DataArgs( + dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="stas/openwebtext-10k", text_column_name="text"), + seed=seed, + ), + ), +] -checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints" +checkpoints_path = "./checkpoints" os.makedirs(checkpoints_path, exist_ok=True) config = Config( @@ -92,16 +110,11 @@ checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=10), parallelism=parallelism, model=ModelArgs(init_method=RandomInit(std=0.025), model_config=model_config), - tokenizer=TokenizerArgs("gpt2"), + tokenizer=TokenizerArgs("robot-test/dummy-tokenizer-wordlevel"), optimizer=optimizer, logging=LoggingArgs(), tokens=tokens, - data_stages=[ - DatasetStageArgs( - name="Stable Training Stage", start_training_step=1, data=DataArgs(dataset=dataset, seed=seed) - ), - DatasetStageArgs(name="Annealing Phase", start_training_step=10, data=DataArgs(dataset=dataset, seed=seed)), - ], + data_stages=data_stages, profiler=None, ) diff --git a/examples/config_tiny_llama.yaml b/examples/config_tiny_llama.yaml index bd69e7f8..58645e2d 100644 --- a/examples/config_tiny_llama.yaml +++ b/examples/config_tiny_llama.yaml @@ -1,3 +1,34 @@ +checkpoints: + checkpoint_interval: 10 + checkpoints_path: checkpoints + 
checkpoints_path_is_shared_file_system: false + resume_checkpoint_path: null + save_initial_state: false +data_stages: +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: stas/openwebtext-10k + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 + name: Stable Training Stage + start_training_step: 1 +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: stas/openwebtext-10k + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 + name: Annealing Phase + start_training_step: 10 general: benchmark_csv_path: null consumed_train_samples: null @@ -6,24 +37,28 @@ general: run: tiny_llama_%date_%jobid seed: 42 step: null +lighteval: null +logging: + iteration_step_info_interval: 1 + log_level: info + log_level_replica: info model: ddp_bucket_cap_mb: 25 dtype: bfloat16 init_method: std: 0.025 - # use_mup: true # uncomment this and comment the std line above to use spectral µTransfer make_vocab_size_divisible_by: 1 model_config: bos_token_id: 1 eos_token_id: 2 hidden_act: silu - hidden_size: 32 + hidden_size: 16 initializer_range: 0.02 - intermediate_size: 128 + intermediate_size: 64 is_llama_config: true max_position_embeddings: 256 num_attention_heads: 4 - num_hidden_layers: 10 + num_hidden_layers: 2 num_key_value_heads: 4 pad_token_id: null pretraining_tp: 1 @@ -34,20 +69,22 @@ model: vocab_size: 256 optimizer: accumulate_grad_in_fp32: true - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 clip_grad: 1.0 learning_rate_scheduler: - learning_rate: 0.001 + learning_rate: 0.0003 lr_decay_starting_step: null - lr_decay_steps: null + lr_decay_steps: 13 lr_decay_style: cosine - lr_warmup_steps: 2000 # 20% of the total steps + lr_warmup_steps: 2 lr_warmup_style: linear min_decay_lr: 1.0e-05 - torch_adam_is_fused: true - weight_decay: 0.1 + optimizer_factory: + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-08 + name: adamW + torch_adam_is_fused: true + weight_decay: 0.01 zero_stage: 0 parallelism: dp: 2 @@ -57,52 +94,16 @@ parallelism: tp: 2 tp_linear_async_communication: true tp_mode: REDUCE_SCATTER -data_stages: - - name: Stable Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 - - name: Annealing Phase - start_training_step: 10 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 -lighteval: null +profiler: null tokenizer: tokenizer_max_length: null - tokenizer_name_or_path: gpt2 + tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel tokenizer_revision: null tokens: batch_accumulation_per_replica: 1 limit_test_batches: 0 limit_val_batches: 0 micro_batch_size: 2 - sequence_length: 32 + sequence_length: 256 train_steps: 15 val_check_interval: -1 -checkpoints: - checkpoint_interval: 10 - checkpoints_path: checkpoints - checkpoints_path_is_shared_file_system: false - resume_checkpoint_path: null - save_initial_state: false 
-profiler: null -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info diff --git a/examples/contributor-guide/debug_config_tiny_llama.py b/examples/contributor-guide/debug_config_tiny_llama.py index e1e2d065..096995b0 100644 --- a/examples/contributor-guide/debug_config_tiny_llama.py +++ b/examples/contributor-guide/debug_config_tiny_llama.py @@ -5,6 +5,7 @@ CheckpointsArgs, Config, DataArgs, + DatasetStageArgs, GeneralArgs, LlamaConfig, LoggingArgs, @@ -95,7 +96,12 @@ optimizer=optimizer, logging=LoggingArgs(), tokens=tokens, - data=DataArgs(dataset=dataset, seed=seed), + data_stages=[ + DatasetStageArgs( + name="Stable Training Stage", start_training_step=1, data=DataArgs(dataset=dataset, seed=seed) + ), + DatasetStageArgs(name="Annealing Phase", start_training_step=10, data=DataArgs(dataset=dataset, seed=seed)), + ], profiler=None, ) diff --git a/examples/contributor-guide/debug_config_tiny_llama.yaml b/examples/contributor-guide/debug_config_tiny_llama.yaml index 27c24ed0..096a49b7 100644 --- a/examples/contributor-guide/debug_config_tiny_llama.yaml +++ b/examples/contributor-guide/debug_config_tiny_llama.yaml @@ -1,23 +1,34 @@ checkpoints: checkpoint_interval: 10 - checkpoints_path: /fsx/ferdinandmom/ferdinand-hf/nanotron/examples/checkpoints + checkpoints_path: /fsx/haojun/nanotron_latest/examples/checkpoints checkpoints_path_is_shared_file_system: false resume_checkpoint_path: null save_initial_state: false - data_stages: - - name: General purpose training - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_splits: train + text_column_name: completion + num_loading_workers: 1 + seed: 42 + name: Stable Training Stage + start_training_step: 1 +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_splits: train + text_column_name: completion + num_loading_workers: 1 + seed: 42 + name: Annealing Phase + start_training_step: 10 general: benchmark_csv_path: null consumed_train_samples: null diff --git a/examples/custom-dataloader/README.md b/examples/custom-dataloader/README.md new file mode 100644 index 00000000..9ded4b3a --- /dev/null +++ b/examples/custom-dataloader/README.md @@ -0,0 +1,39 @@ +# Use a custom dataloader with Nanotron + +This example shows how to use a custom dataloader with Nanotron. We will use a simple dataloader that loads a random tokenized dataset and feeds it to a Nanotron model. +https://github.com/huggingface/nanotron/blob/2e21db0db46a40bedbd03714616dd0ae4ea75914/examples/custom-dataloader/run_train.py#L72-L84 + +`DataCollatorForCLM` is a custom data collator that takes a list of input_ids and returns a dictionary with the input_ids and the labels on the ranks which need it. For example `input_ids` are only needed in the first PP rank, while `labels` are needed in the last PP rank. 
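+
+For reference, the linked script builds that random tokenized dataset roughly as follows. This is a condensed sketch of the snippet above; the concrete numbers are stand-ins for values the script reads from the trainer config:
+
+```python
+import datasets
+import numpy as np
+
+vocab_size = 256       # trainer.config.model.model_config.vocab_size in the example config
+sequence_length = 256  # trainer.sequence_length
+num_samples = 1024     # roughly global batch size * number of remaining training steps
+
+# Each sample carries sequence_length + 1 token ids so the collator can shift
+# inputs and labels by one position.
+train_dataset = datasets.Dataset.from_dict(
+    {"input_ids": np.random.randint(0, vocab_size, (num_samples, sequence_length + 1))}
+)
+```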
+ +And to test it out, you should fix your config to have: (example: [config_custom_dl.yaml](config_custom_dl.yaml)) +```yaml +- data: + dataset: null # Custom dataloader will be used + num_loading_workers: 1 + seed: 42 + name: Stable Training Stage + start_training_step: 1 +``` + +To try it out you can run the following command: + +```bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations +torchrun --nproc_per_node=2 examples/custom-dataloader/run_train.py --config-file examples/custom-dataloader/config_custom_dl.yaml +``` + +## Troubleshooting + +### `return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)` +``` + File "/fsx/nouamane/projects/nanotron/src/nanotron/parallel/tensor_parallel/nn.py", line 284, in forward + out = super().forward(masked_input) + File "/fsx/nouamane/miniconda/envs/2-1-cu121/lib/python3.10/site-packages/torch/nn/modules/sparse.py", line 162, in forward + return F.embedding( + File "/fsx/nouamane/miniconda/envs/2-1-cu121/lib/python3.10/site-packages/torch/nn/functional.py", line 2233, in embedding + return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) +RuntimeError: CUDA error: device-side assert triggered +Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions. +``` + +If you encounter an error with `torch.embedding`, it's probable you're feeding a token which is bigger than the model's vocabulary size. Check your model's vocab size and tokenizer diff --git a/examples/custom-dataloader/config_custom_dl.yaml b/examples/custom-dataloader/config_custom_dl.yaml new file mode 100644 index 00000000..970e7407 --- /dev/null +++ b/examples/custom-dataloader/config_custom_dl.yaml @@ -0,0 +1,103 @@ +checkpoints: + checkpoint_interval: 10 + checkpoints_path: checkpoints + checkpoints_path_is_shared_file_system: false + resume_checkpoint_path: null + save_initial_state: false +data_stages: +- data: + dataset: null # Custom dataloader will be used + num_loading_workers: 1 + seed: 42 + name: Stable Training Stage + start_training_step: 1 +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 1 + hf_dataset_config_name: null + hf_dataset_or_datasets: stas/openwebtext-10k + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 + name: Annealing Phase + start_training_step: 10 +general: + benchmark_csv_path: null + consumed_train_samples: null + ignore_sanity_checks: true + project: debug + run: tiny_llama_%date_%jobid + seed: 42 + step: null +lighteval: null +logging: + iteration_step_info_interval: 1 + log_level: info + log_level_replica: info +model: + ddp_bucket_cap_mb: 25 + dtype: bfloat16 + init_method: + std: 0.025 + make_vocab_size_divisible_by: 1 + model_config: + bos_token_id: 1 + eos_token_id: 2 + hidden_act: silu + hidden_size: 16 + initializer_range: 0.02 + intermediate_size: 64 + is_llama_config: true + max_position_embeddings: 256 + num_attention_heads: 4 + num_hidden_layers: 2 + num_key_value_heads: 4 + pad_token_id: null + pretraining_tp: 1 + rms_norm_eps: 1.0e-05 + rope_scaling: null + tie_word_embeddings: true + use_cache: true + vocab_size: 256 +optimizer: + accumulate_grad_in_fp32: true + clip_grad: 1.0 + learning_rate_scheduler: + learning_rate: 0.0003 + lr_decay_starting_step: null + lr_decay_steps: 13 + lr_decay_style: cosine + lr_warmup_steps: 2 + lr_warmup_style: linear + min_decay_lr: 1.0e-05 + optimizer_factory: + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-08 + name: adamW + 
torch_adam_is_fused: true + weight_decay: 0.01 + zero_stage: 0 +parallelism: + dp: 2 + expert_parallel_size: 1 + pp: 1 + pp_engine: 1f1b + tp: 1 + tp_linear_async_communication: true + tp_mode: REDUCE_SCATTER +profiler: null +tokenizer: + tokenizer_max_length: null + tokenizer_name_or_path: robot-test/dummy-tokenizer-wordlevel + tokenizer_revision: null +tokens: + batch_accumulation_per_replica: 1 + limit_test_batches: 0 + limit_val_batches: 0 + micro_batch_size: 2 + sequence_length: 256 + train_steps: 15 + val_check_interval: -1 diff --git a/examples/custom-dataloader/run_train.py b/examples/custom-dataloader/run_train.py new file mode 100644 index 00000000..e1995381 --- /dev/null +++ b/examples/custom-dataloader/run_train.py @@ -0,0 +1,222 @@ +""" +Nanotron training script example using a custom dataloader. + +Usage: +``` +export CUDA_DEVICE_MAX_CONNECTIONS=1 # important for some distributed operations +torchrun --nproc_per_node=2 examples/custom-dataloader/run_train.py --config-file examples/custom-dataloader/config_custom_dl.yaml +``` +""" +import argparse +from typing import Dict, cast + +import datasets +import numpy as np +from nanotron import logging +from nanotron.config import ( + DataArgs, + DatasetStageArgs, + PretrainDatasetsArgs, +) +from nanotron.dataloader import ( + DataCollatorForCLM, + clm_process, + get_dataloader_worker_init, + get_datasets, + get_train_dataloader, +) +from nanotron.helpers import ( + compute_remain_train_steps_of_a_data_stage_from_ckp, + get_consumed_train_samples_of_a_data_stage_from_ckp, +) +from nanotron.logging import log_rank +from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks +from nanotron.trainer import DistributedTrainer +from nanotron.utils import main_rank_first +from torch.utils.data import DataLoader + +try: + from huggingface_hub import __version__ as hf_hub_version + from transformers import AutoTokenizer + from transformers import __version__ as tf_version +except ImportError: + hf_hub_version = None + tf_version = None + +logger = logging.get_logger(__name__) + + +def get_dataloader_from_data_stage( + trainer: DistributedTrainer, + data: DataArgs, + consumed_train_samples: int, + num_remaining_train_steps: int, +): + """ + Returns a dataloader for a given data stage. + + data: The data configuration for the current stage. + consumed_train_samples: The number of samples consumed by the model in the this stage (each stage starts from zero). + num_remaining_train_steps: The number of remaining training steps for this stage. 
+ """ + assert consumed_train_samples >= 0, "consumed_train_samples should be greater than 0" + assert num_remaining_train_steps >= 0, "num_remaining_train_steps should be greater than 0" + + # First, we need to know which ranks to feed the dataloader to + input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) + + # Case 1: custom data generator + if data.dataset is None: + log_rank("Using custom data generator", logger=logger, level=logging.INFO, rank=0) + + ########################################################################################################### + # This can be replaced with your own tokenized data generator + ########################################################################################################### + train_dataset = datasets.Dataset.from_dict( + { + "input_ids": np.random.randint( + 0, + trainer.config.model.model_config.vocab_size, + (trainer.global_batch_size * num_remaining_train_steps, trainer.sequence_length + 1), + ), + } + ) + ########################################################################################################### + + data_collator = DataCollatorForCLM( + sequence_length=trainer.sequence_length, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + parallel_context=trainer.parallel_context, + ) + + return DataLoader( + train_dataset, + batch_size=trainer.micro_batch_size, + collate_fn=data_collator, + drop_last=True, + num_workers=0, + pin_memory=True, + worker_init_fn=get_dataloader_worker_init(dp_rank=trainer.parallel_context.dp_pg.rank()), + ) + + # Case 2: HuggingFace datasets + elif isinstance(data.dataset, PretrainDatasetsArgs): + log_rank("Using `datasets` library", logger=logger, level=logging.INFO, rank=0) + tokenizer_path = trainer.config.tokenizer.tokenizer_name_or_path + log_rank( + f"Loading tokenizer from {tokenizer_path} and transformers/hf_hub versions {tf_version, hf_hub_version}", + logger=logger, + level=logging.INFO, + rank=0, + ) + + # We need to the 1st device to process dataset and cache it, then other devices load from cache + with main_rank_first(trainer.parallel_context.world_pg): + # We load the raw dataset + raw_dataset = get_datasets( + hf_dataset_or_datasets=data.dataset.hf_dataset_or_datasets, + hf_dataset_config_name=data.dataset.hf_dataset_config_name, + splits=data.dataset.hf_dataset_splits, + )["train"] + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + tokenizer.pad_token = tokenizer.eos_token + tokenizer.padding_side = "left" + + # We apply the Causal Language Modeling preprocessing + train_dataset = clm_process( + raw_dataset=raw_dataset, + tokenizer=tokenizer, + text_column_name=data.dataset.text_column_name, + dataset_processing_num_proc_per_process=data.dataset.dataset_processing_num_proc_per_process, + dataset_overwrite_cache=data.dataset.dataset_overwrite_cache, + sequence_length=trainer.sequence_length, + ) + + # We load the processed dataset on the ranks requiring it + dataloader = get_train_dataloader( + train_dataset=train_dataset, + sequence_length=trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=trainer.micro_batch_size, + consumed_train_samples=consumed_train_samples, + dataloader_num_workers=data.num_loading_workers, + seed_worker=data.seed, + dataloader_drop_last=True, + ) + + # Check if we have enough samples for train_steps + total_tokens_dataset = len(dataloader.dataset) * trainer.sequence_length + 
num_tokens_needed_for_training = ( + num_remaining_train_steps * trainer.global_batch_size * trainer.sequence_length + ) + assert num_tokens_needed_for_training <= total_tokens_dataset, ( + f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), " + f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}" + ) + else: + raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}") + + return dataloader + + +def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]: + dataloaders = {} + + for stage_idx, stage in enumerate(trainer.config.data_stages): + # NOTE: we only create the dataloader for the first stage, + # then we lazy initialize the dataloader for the other stages + stage = cast(DatasetStageArgs, stage) + consumed_train_samples = get_consumed_train_samples_of_a_data_stage_from_ckp(stage, trainer.metadata) + assert ( + consumed_train_samples is not None + ), f"Cannot find consumed_train_samples for stage {stage.start_training_step} in the checkpoint" + + num_remaining_train_steps = compute_remain_train_steps_of_a_data_stage_from_ckp( + stage, trainer.config, trainer.metadata + ) + log_rank( + f"[Training Plan] Stage {stage.name} has {num_remaining_train_steps} remaining training steps and has consumed {consumed_train_samples} samples", + logger=logger, + level=logging.INFO, + rank=0, + ) + + dataloader = ( + get_dataloader_from_data_stage( + trainer, + stage.data, + consumed_train_samples=consumed_train_samples, + num_remaining_train_steps=num_remaining_train_steps, + ) + if stage_idx == 0 + else lambda stage=stage: get_dataloader_from_data_stage( + trainer, + stage.data, + consumed_train_samples=consumed_train_samples, + num_remaining_train_steps=num_remaining_train_steps, + ) + ) + dataloaders[stage.name] = dataloader + return dataloaders + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--config-file", type=str, required=True, help="Path to the YAML or python config file") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args() + config_file = args.config_file + + # Load trainer and data + trainer = DistributedTrainer(config_file) + dataloader = get_dataloader(trainer) + + # Train + trainer.train(dataloader) diff --git a/examples/doremi/README.md b/examples/doremi/README.md index 5a726bd1..dfc9ea40 100644 --- a/examples/doremi/README.md +++ b/examples/doremi/README.md @@ -87,3 +87,7 @@ For evaluation, we do uniform sampling on the test set to evaluate a 2.5B model - 2.5B llama trained using the optimized weights: https://huggingface.co/nanotron/doremi-llama-2.5b-optimized-weights and the dataset: https://huggingface.co/datasets/nanotron/the-pile-for-doremi + +#### Thoughts + +For DoReMi, it's useful if you don't initially have an idea of what would be a good distribution for your training data, or want a quick way to find a better baseline than the uniform distribution if you want to tune the data distribution by hand. In my previous experiments, DoReMi matched the pretraining performance of the distribution of mamba training but couldn't outperform it. I suspect it doesn't work well when there are nuances, meaning the difference between your known best distribution and a better distribution isn't significant. 
diff --git a/examples/llama/README.md b/examples/llama/README.md new file mode 100644 index 00000000..d8915d38 --- /dev/null +++ b/examples/llama/README.md @@ -0,0 +1,17 @@ +## Debugging the tests with vscode + +To debug the tests with vscode, add the following json to your `launch.json` file. + +``` +{ + "name": "Test conversion", + "type": "python", + "request": "launch", + "module": "pytest", + "console": "integratedTerminal", + "args": [ + "examples/llama/tests" + ], + "justMyCode": false +} +``` diff --git a/examples/llama/__init__.py b/examples/llama/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/examples/llama/convert_hf_to_nanotron.py b/examples/llama/convert_hf_to_nanotron.py new file mode 100644 index 00000000..9fc81949 --- /dev/null +++ b/examples/llama/convert_hf_to_nanotron.py @@ -0,0 +1,119 @@ +""" +Converts a HF model to nanotron format +Command: + torchrun --nproc_per_node=1 convert_hf_to_nanotron.py --checkpoint_path=hf_weights --save_path=nanotron_weights +""" + +import dataclasses +import json +from argparse import ArgumentParser +from pathlib import Path + +import nanotron +import torch +from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model +from nanotron.config import LlamaConfig as NanotronLlamaConfig +from nanotron.models.llama import LlamaForTraining +from transformers import LlamaConfig as HFLlamaConfig +from transformers import LlamaForCausalLM + + +def _handle_attention_block( + q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, n_q_heads: int, n_kv_heads: int, d_qk: int +) -> torch.Tensor: + # Huggingface Llama separates the q, k, v weights (as opposed to nanotron). + # Furthermore, in the rotary embeddings in nanotron expects interleaved pairs of even + # and odd dimensions GPT-J style, while the huggingface implementation expects + # the whole 1st half and then the whole 2nd half GPT-NeoX style (for more information + # see flash_attn.layers.rotary.RotaryEmbedding). + # This function handles the concatenation of the q, k, v weights and proper permutation + # to ensure correct transformation. + + def interleave(w: torch.Tensor): + w_new = [] + for head_w in w.split(d_qk): + head_w = head_w.view(2, d_qk // 2, -1).transpose(0, 1).reshape(d_qk, -1) + w_new.append(head_w) + return torch.cat(w_new) + + q = interleave(q) + k = interleave(k) + return torch.cat([q, k, v]) + + +def convert_hf_to_nt(model_hf: LlamaForCausalLM, model_nt: LlamaForTraining, config: NanotronLlamaConfig): + """Converts the weights from the model_hf to model_nt, making modifications + in-place.""" + + hf_sd = model_hf.state_dict() + nt_to_hf = get_weight_mapping(config, nt_to_hf=True) + + for module_name_nt, module_nt in model_nt.named_modules(): + for param_name_nt, param_nt in module_nt.named_parameters(recurse=False): + # In the case of qkv_proj, the nt_to_hf has exactly three keys, ccorresponding + # to q, k, v. + if "qkv_proj" in module_name_nt: + key_k, key_q, key_v = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"]) + q = hf_sd[key_q] + k = hf_sd[key_k] + v = hf_sd[key_v] + param = _handle_attention_block( + q, + k, + v, + config.num_attention_heads, + config.num_key_value_heads, + config.hidden_size // config.num_attention_heads, + ) + # The case of gate_up_proj, nt_to_hf_map has two keys. 
+ elif "gate_up_proj" in module_name_nt: + key_gate, key_up = sorted(nt_to_hf[f"{module_name_nt}.{param_name_nt}"]) + gate = hf_sd[key_gate] + up = hf_sd[key_up] + param = torch.cat([gate, up]) + # All other cases are simple 1-to-1 correspondence. + else: + hf_key = nt_to_hf[f"{module_name_nt}.{param_name_nt}"] + param = hf_sd[hf_key] + + with torch.no_grad(): + param_nt.copy_(param) + + +def get_nanotron_config(config: HFLlamaConfig) -> NanotronLlamaConfig: + """Converts a huggingface configuration to nanotron configuration.""" + attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=True).items()} + return NanotronLlamaConfig(**attrs) + + +def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path): + """Loads the huggingface checkpoint in `checkpoint_path`, creates + a new nanotron instance, copies the weights from the huggingface checkpoint + and saves the transformed nanotron to `save_path`.""" + + # Load huggingface. + hf_model = LlamaForCausalLM.from_pretrained(checkpoint_path) + + # Init nanotron model. + model_config = get_nanotron_config(hf_model.config) + nanotron_model = load_nanotron_model(model_config=model_config) + + # Copy weights and save model. + parallel_context = nanotron.parallel.ParallelContext( + data_parallel_size=1, pipeline_parallel_size=1, tensor_parallel_size=1 + ) + convert_hf_to_nt(hf_model, nanotron_model, model_config) + nanotron.serialize.save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=save_path) + with open(save_path / "model_config.json", "w+") as f: + json.dump(dataclasses.asdict(model_config), f) + print(f"Model saved to {save_path}") + + +if __name__ == "__main__": + parser = ArgumentParser(description="Convert HF weights to nanotron format") + parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint") + parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the nanotron model") + args = parser.parse_args() + + # Convert HF model to nanotron format. + convert_checkpoint_and_save(checkpoint_path=args.checkpoint_path, save_path=args.save_path) diff --git a/examples/llama/convert_nanotron_to_hf.py b/examples/llama/convert_nanotron_to_hf.py new file mode 100644 index 00000000..e11b27da --- /dev/null +++ b/examples/llama/convert_nanotron_to_hf.py @@ -0,0 +1,154 @@ +""" +Converts a nanotron model to HF format +Command: + torchrun --nproc_per_node=1 convert_nanotron_to_hf.py --checkpoint_path=nanotron-path --save_path=hf-path +""" + +import json +from argparse import ArgumentParser +from pathlib import Path +from typing import Literal, Optional + +import torch +from convert_weights import get_config_mapping, get_weight_mapping, load_nanotron_model +from nanotron.config import LlamaConfig as NanotronLlamaConfig +from nanotron.models import init_on_device_and_dtype +from nanotron.models.llama import LlamaForTraining +from transformers import AutoTokenizer, LlamaForCausalLM +from transformers import LlamaConfig as HFLlamaConfig + +TEST_PROMPT = "What is the meaning of the word chutzpah?\nThe word chutzpah means" + + +def _handle_attention_block( + qkv: torch.Tensor, part: Literal["q", "k", "v"], n_q_heads: int, n_kv_heads: int, d_qk: int +) -> torch.Tensor: + # Huggingface Llama separates the q, k, v weights (as opposed to nanotron). 
+ # Furthermore, in the rotary embeddings in nanotron expects interleaved pairs of even + # and odd dimensions GPT-J style, while the huggingface implementation expects + # the whole 1st half and then the whole 2nd half GPT-NeoX style (for more information + # see flash_attn.layers.rotary.RotaryEmbedding). + # This function selects the proper chunk of the bundled qkv tensor and permutation + # to ensure correct transformation to huggingface. + + def interleave(w: torch.Tensor): + w_new = [] + for head_w in w.split(d_qk): + head_w = head_w.view(d_qk // 2, 2, -1).transpose(0, 1).reshape(d_qk, -1) + w_new.append(head_w) + return torch.cat(w_new) + + assert part in ["q", "k", "v"], "part must be one of [q, k, v]" + + index_end_q = n_q_heads * d_qk + index_end_k = index_end_q + n_kv_heads * d_qk + if part == "q": + return interleave(qkv[:index_end_q]) + if part == "k": + return interleave(qkv[index_end_q:index_end_k]) + return qkv[index_end_k:] + + +def _handle_gate_up_proj(gate_up_proj: torch.Tensor, gate: bool) -> torch.Tensor: + # The gate and up projection are bundled in nanotron. + # This function selects the proper chunk in the bundled weights to return + # either the gate or the up projection only. + weight_size = gate_up_proj.shape[0] // 2 + if gate: + return gate_up_proj[:weight_size] + else: + return gate_up_proj[weight_size:] + + +def convert_nt_to_hf(nanotron_model: LlamaForTraining, hf_model: LlamaForCausalLM, model_config: NanotronLlamaConfig): + """Converts the weights from the nanotron_model to hf_model, making modifications + in-place.""" + + nanotron_model_state_dict = nanotron_model.state_dict() + + hf_to_nt = get_weight_mapping(model_config, nt_to_hf=False) + for module_name_hf, module_hf in hf_model.named_modules(): + for param_name_hf, param_hf in module_hf.named_parameters(recurse=False): + # Get the Nanotron parameter + nanotron_key = hf_to_nt[f"{module_name_hf}.{param_name_hf}"] + param = nanotron_model_state_dict[nanotron_key] + + if "qkv_proj" in nanotron_key: + proj_name = module_name_hf.split(".")[4][0] + param = _handle_attention_block( + param, + proj_name, + model_config.num_attention_heads, + model_config.num_key_value_heads, + model_config.hidden_size // model_config.num_attention_heads, + ) + + elif "gate_up_proj" in nanotron_key: + gate = "gate" in module_name_hf + param = _handle_gate_up_proj(param, gate) + + with torch.no_grad(): + param_hf.copy_(param) + + +def get_hf_config(config: NanotronLlamaConfig) -> HFLlamaConfig: + """Converts a nanotron configuration to huggingface configuration.""" + attrs = {key: getattr(config, value) for key, value in get_config_mapping(nt_to_hf=False).items()} + return HFLlamaConfig(**attrs) + + +def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str] = None): + """Loads the nanotron checkpoint in `checkpoint_path`, creates + a new huggingface instance, copies the weights from the nanotron checkpoint + and saves the transformed huggingface to `save_path`.""" + + # Init nanotron model. + with open(checkpoint_path / "model_config.json", "r") as f: + attrs = json.load(f) + model_config = NanotronLlamaConfig(**attrs) + nanotron_model = load_nanotron_model( + model_config=model_config, + checkpoint_path=checkpoint_path, + ) + # Init huggingface model. + with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16): + model_config_hf = get_hf_config(model_config) + hf_model = LlamaForCausalLM._from_config(model_config_hf) + + # Copy weights, initialize tokenizer and save model. 
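+    # Saving the tokenizer is optional for the weight conversion itself, but
+    # `check_converted_model_generation` below loads both the tokenizer and the model
+    # from `save_path`, so a `tokenizer_name` is needed if you want the generation
+    # sanity check to run on the converted folder.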
+ if tokenizer_name is not None: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.save_pretrained(save_path) + convert_nt_to_hf(nanotron_model, hf_model, model_config) + hf_model.save_pretrained(save_path) + print(f"Model saved to {save_path}") + + +def check_converted_model_generation(save_path: Path): + """Loads a huggingface model and tokenizer from `save_path` and + performs a dummy text generation.""" + + tokenizer = AutoTokenizer.from_pretrained(save_path) + input_ids = tokenizer(TEST_PROMPT, return_tensors="pt")["input_ids"].cuda() + print("Inputs:", tokenizer.batch_decode(input_ids)) + + model = LlamaForCausalLM.from_pretrained(save_path).cuda().bfloat16() + out = model.generate(input_ids, max_new_tokens=100) + print("Generation (converted): ", tokenizer.batch_decode(out)) + + +if __name__ == "__main__": + parser = ArgumentParser(description="Convert Nanotron weights to HF format") + parser.add_argument("--checkpoint_path", type=Path, default="llama-7b", help="Path to the checkpoint") + parser.add_argument("--save_path", type=Path, default="llama-7b-hf", help="Path to save the HF model") + parser.add_argument("--tokenizer_name", type=str, default="meta-llama/Llama-2-7b-chat-hf") + args = parser.parse_args() + + # Convert Nanotron model to HF format. + convert_checkpoint_and_save( + checkpoint_path=args.checkpoint_path, save_path=args.save_path, tokenizer_name=args.tokenizer_name + ) + + # Check if the conversion was successful by generating some text. + if args.tokenizer_name is not None: + check_converted_model_generation(save_path=args.save_path) diff --git a/examples/llama/convert_weights.py b/examples/llama/convert_weights.py new file mode 100644 index 00000000..7663399a --- /dev/null +++ b/examples/llama/convert_weights.py @@ -0,0 +1,141 @@ +import json +from pathlib import Path +from typing import Optional + +import nanotron +import torch +from nanotron.config import LlamaConfig as NanotronLlamaConfig +from nanotron.models.llama import LlamaForTraining +from nanotron.trainer import mark_tied_parameters + + +def get_weight_mapping(config: NanotronLlamaConfig, nt_to_hf: bool = True) -> dict[str, str]: + """Returns the nanotron to huggingface parameter mapping if `nt_to_hf`, otherwise the + huggingface to nanotron mapping.""" + + hf_to_nt_map = {} + hf_to_nt_map["lm_head.weight"] = "model.lm_head.pp_block.weight" + hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight" + hf_to_nt_map["model.norm.weight"] = "model.final_layer_norm.pp_block.weight" + hf_to_nt_map["model.embed_tokens.weight"] = "model.token_position_embeddings.pp_block.token_embedding.weight" + + for i in range(config.num_hidden_layers): + hf_prefix = f"model.layers.{i}" + nt_prefix = f"model.decoder.{i}.pp_block" + hf_to_nt_map[f"{hf_prefix}.self_attn.q_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight" + hf_to_nt_map[f"{hf_prefix}.self_attn.k_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight" + hf_to_nt_map[f"{hf_prefix}.self_attn.v_proj.weight"] = f"{nt_prefix}.attn.qkv_proj.weight" + hf_to_nt_map[f"{hf_prefix}.self_attn.o_proj.weight"] = f"{nt_prefix}.attn.o_proj.weight" + hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight" + hf_to_nt_map[f"{hf_prefix}.mlp.gate_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias" + hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.weight"] = f"{nt_prefix}.mlp.gate_up_proj.weight" + hf_to_nt_map[f"{hf_prefix}.mlp.up_proj.bias"] = f"{nt_prefix}.mlp.gate_up_proj.bias" + 
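+        # Several HF keys above intentionally share one nanotron target: q/k/v_proj all
+        # map to attn.qkv_proj and gate/up_proj both map to mlp.gate_up_proj, which is
+        # why the reversed mapping built below collects them into lists.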
hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.weight"] = f"{nt_prefix}.mlp.down_proj.weight" + hf_to_nt_map[f"{hf_prefix}.mlp.down_proj.bias"] = f"{nt_prefix}.mlp.down_proj.bias" + hf_to_nt_map[f"{hf_prefix}.input_layernorm.weight"] = f"{nt_prefix}.input_layernorm.weight" + hf_to_nt_map[f"{hf_prefix}.post_attention_layernorm.weight"] = f"{nt_prefix}.post_attention_layernorm.weight" + + if nt_to_hf: + nt_to_hf_map = {} + for hf, nt in hf_to_nt_map.items(): + # Because the qkv and gate_up projections are separated in the + # huggingface format, when we return nanotron to huggingface + # we will need to return a list of parameters instead (e.g. + # the `qkv_proj` will point to a list `[q_proj, k_proj, v_proj]`). + if nt in nt_to_hf_map and isinstance(nt_to_hf_map[nt], list): + nt_to_hf_map[nt].append(hf) + elif nt in nt_to_hf_map: + nt_to_hf_map[nt] = [nt_to_hf_map[nt], hf] + else: + nt_to_hf_map[nt] = hf + return nt_to_hf_map + return hf_to_nt_map + + +def get_config_mapping(nt_to_hf: bool = True) -> dict[str, str]: + """Returns either the nanotron to huggingface (if `nt_to_hf`) + configuration mapping, or the huggingface to nanotron.""" + + hf_to_nt_map = { + "bos_token_id": "bos_token_id", + "eos_token_id": "eos_token_id", + "hidden_act": "hidden_act", + "hidden_size": "hidden_size", + "initializer_range": "initializer_range", + "intermediate_size": "intermediate_size", + "max_position_embeddings": "max_position_embeddings", + "num_attention_heads": "num_attention_heads", + "num_hidden_layers": "num_hidden_layers", + "num_key_value_heads": "num_key_value_heads", + "pad_token_id": "pad_token_id", + "pretraining_tp": "pretraining_tp", + "rms_norm_eps": "rms_norm_eps", + "rope_scaling": "rope_scaling", + "rope_theta": "rope_theta", + "tie_word_embeddings": "tie_word_embeddings", + "use_cache": "use_cache", + "vocab_size": "vocab_size", + } + if nt_to_hf: + return {nt: hf for hf, nt in hf_to_nt_map.items()} + return hf_to_nt_map + + +def make_parallel_config( + dp: int = 1, + pp: int = 1, + tp: int = 1, +): + parallel_config = nanotron.config.ParallelismArgs( + dp=dp, + pp=pp, + tp=tp, + pp_engine=nanotron.config.AllForwardAllBackwardPipelineEngine(), + tp_mode=nanotron.config.TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + return parallel_config + + +def load_nanotron_model( + model_config: Optional[NanotronLlamaConfig] = None, + device: torch.device = torch.device("cuda"), + dtype: torch.dtype = torch.bfloat16, + checkpoint_path: Optional[Path] = None, +) -> LlamaForTraining: + """ + Creates and returns a nanotron model. + If `model_config` is None, then `checkpoint_path` must be set, in which case + the configuration will be loaded from such path. + If `checkpoint_path` is None, then `model_config` must be set, in which case + the model created will have random weights. 
+ """ + + if model_config is None: + assert checkpoint_path is not None + with open(checkpoint_path / "model_config.json") as f: + model_config = NanotronLlamaConfig(**json.load(f)) + parallel_config = make_parallel_config() + parallel_context = nanotron.parallel.ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + nanotron_model = nanotron.models.build_model( + model_builder=lambda: LlamaForTraining( + config=model_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=dtype, + device=device, + ) + mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) + # Load checkpoint directly in memory and then only keep the state dictionary + if checkpoint_path is not None: + nanotron.serialize.load_weights( + model=nanotron_model, parallel_context=parallel_context, root_folder=checkpoint_path + ) + return nanotron_model diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt new file mode 100644 index 00000000..44012743 --- /dev/null +++ b/examples/llama/requirements.txt @@ -0,0 +1 @@ +transformers==4.39.3 diff --git a/examples/llama/tests/test_conversion.py b/examples/llama/tests/test_conversion.py new file mode 100644 index 00000000..b5ce3529 --- /dev/null +++ b/examples/llama/tests/test_conversion.py @@ -0,0 +1,251 @@ +# ruff: noqa: E402 +import dataclasses +import json +from pathlib import Path + +import pytest +import torch +from transformers import LlamaForCausalLM +from utils import set_system_path + +set_system_path() + +import nanotron +from nanotron.config import LlamaConfig as NanotronLlamaConfig +from nanotron.models.base import init_on_device_and_dtype +from nanotron.models.llama import LlamaForTraining +from nanotron.parallel import ParallelContext +from nanotron.trainer import mark_tied_parameters + +from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save +from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt +from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save +from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config +from examples.llama.convert_weights import load_nanotron_model, make_parallel_config +from tests.helpers.context import TestContext +from tests.helpers.utils import init_distributed + +CONFIG = NanotronLlamaConfig( + **{ + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 1024, + "is_llama_config": True, + "max_position_embeddings": 128, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 4, + "pad_token_id": None, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": None, + "tie_word_embeddings": False, + "use_cache": True, + "vocab_size": 4096, + } +) + + +BATCH_SIZE = 3 +SEQUENCE_LENGTH = 5 +ATOL = 0.03 + + +def create_nanotron_model(parallel_context: ParallelContext) -> LlamaForTraining: + parallel_config = make_parallel_config( + tp=parallel_context.tensor_parallel_size, + dp=parallel_context.data_parallel_size, + pp=parallel_context.pipeline_parallel_size, + ) + nanotron_model = nanotron.models.build_model( + model_builder=lambda: LlamaForTraining( + config=CONFIG, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + 
), + parallel_context=parallel_context, + dtype=torch.bfloat16, + device=torch.device("cuda"), + ) + mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) + return nanotron_model + + +def create_huggingface_model() -> LlamaForCausalLM: + config_hf = get_hf_config(CONFIG) + with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16): + model_hf = LlamaForCausalLM._from_config(config_hf) + return model_hf + + +@pytest.fixture(autouse=True, scope="module") +def fix_seed(): + torch.manual_seed(0) + yield + + +@pytest.fixture +def input_ids() -> torch.Tensor: + return torch.randint(0, CONFIG.vocab_size, size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda") + + +def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor): + model_nt = create_nanotron_model(parallel_context) + model_hf = create_huggingface_model() + convert_nt_to_hf(model_nt, model_hf, CONFIG) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +def test_nt_to_hf(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids) + + +def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext): + # Create and save nanotron model. + model_nt = create_nanotron_model(parallel_context) + root = test_context.get_auto_remove_tmp_dir() + nt_path = root / "nanotron" + hf_path = root / "hf" + nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path) + with open(nt_path / "model_config.json", "w+") as f: + json.dump(dataclasses.asdict(CONFIG), f) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + del model_nt + # Perform conversion. + convert_nt_to_hf_and_save(nt_path, hf_path) + # Load huggingface and get logits. + model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda() + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + torch.testing.assert_allclose(logits_nt, logits_hf, atol=ATOL) + + +def test_nt_to_hf_with_files(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext()) + + +def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor): + model_nt = create_nanotron_model(parallel_context) + model_hf = create_huggingface_model() + convert_hf_to_nt(model_hf, model_nt, CONFIG) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + torch.testing.assert_allclose(logits_hf, logits_nt, atol=ATOL) + + +def test_hf_to_nt(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids) + + +def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext): + # Create and save hf model. + model_hf = create_huggingface_model() + root = test_context.get_auto_remove_tmp_dir() + nt_path = root / "nanotron" + hf_path = root / "hf" + model_hf.save_pretrained(hf_path) + logits_hf = model_hf(input_ids).logits + del model_hf + # Perform conversion. 
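+    # Note the argument order: the source (HF) checkpoint comes first and the
+    # destination (nanotron) folder second, mirroring the (checkpoint_path, save_path)
+    # signature of `convert_checkpoint_and_save` in convert_hf_to_nanotron.py.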
+ convert_hf_to_nt_and_save(hf_path, nt_path) + # Load nanotron and get logits. + input_mask = torch.ones_like(input_ids) + model_nt = load_nanotron_model(checkpoint_path=nt_path) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL) + + +def test_hf_to_nt_with_files(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext()) + + +def _test_composed_conversion(parallel_context: ParallelContext): + # Get HF statedict. + model_hf = create_huggingface_model() + hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()} + # Convert once to nanotron, save its statedict. + model_nt = create_nanotron_model(parallel_context) + convert_hf_to_nt(model_hf, model_nt, CONFIG) + nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()} + # Convert back to HF, compare statedicts. + del model_hf + model_hf = create_huggingface_model() + convert_nt_to_hf(model_nt, model_hf, CONFIG) + hf_sd_new = model_hf.state_dict() + assert set(hf_sd_new) == set(hf_sd) + assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new) + # Convert to nanotron one more time, compare statedicts. + del model_nt + model_nt = create_nanotron_model(parallel_context) + convert_hf_to_nt(model_hf, model_nt, CONFIG) + nt_sd_new = model_nt.state_dict() + assert set(nt_sd_new) == set(nt_sd) + assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new) + + +def test_composed_conversion(): + init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)() + + +def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path): + # Create and save a parallel model. + model_nt = create_nanotron_model(parallel_context) + nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path) + with open(nt_path / "model_config.json", "w+") as f: + json.dump(dataclasses.asdict(CONFIG), f) + + # Get parallel predictions. + input_ids = input_ids.cuda() # Move them to the current device index. + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + if torch.distributed.get_rank() == 0: + torch.save(logits_nt.detach().cpu(), nt_path / "logits.pt") + + # Convert nanotron to hf, load it and compare logits. + # hf_path = root/"hf" + # convert_nt_to_hf_and_save(nt_path, hf_path) + # model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda() + # logits_hf = model_hf(input_ids).logits + + # assert logits_nt.size() == logits_hf.size() + # assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path): + # Convert parallel nanotron to hf, get and save huggingface predictions. + convert_nt_to_hf_and_save(nt_path, hf_path) + model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda() + logits_hf = model_hf(input_ids).logits + torch.save(logits_hf.detach().cpu(), hf_path / "logits.pt") + + +def test_tensor_parallel_conversion(input_ids: torch.Tensor): + # Set up test. + test_context = TestContext() + root = test_context.get_auto_remove_tmp_dir() + nt_path = root / "nanotron" + hf_path = root / "nanotron" + + # Launch both parts. 
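+    # The first stage saves a TP=2-sharded nanotron checkpoint together with its logits;
+    # the second stage runs in a fresh TP=1 process, so the conversion also exercises
+    # loading a tensor-parallel checkpoint into a single-rank model before the HF export.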
+ init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path) + assert (nt_path / "logits.pt").exists() + init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path) + assert (hf_path / "logits.pt").exists() + + # Load logits and verify they match. + logits_nt = torch.load(nt_path / "logits.pt") + logits_hf = torch.load(hf_path / "logits.pt") + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) diff --git a/examples/llama/tests/test_conversion.py.orig b/examples/llama/tests/test_conversion.py.orig new file mode 100644 index 00000000..af068837 --- /dev/null +++ b/examples/llama/tests/test_conversion.py.orig @@ -0,0 +1,264 @@ +# ruff: noqa: E402 +import json +<<<<<<< HEAD +from pathlib import Path +======= +>>>>>>> main + +import pytest +import torch +from transformers import LlamaForCausalLM +from utils import set_system_path + +set_system_path() + +import nanotron +from nanotron.config import LlamaConfig as NanotronLlamaConfig +from nanotron.models.base import init_on_device_and_dtype +from nanotron.models.llama import LlamaForTraining +from nanotron.parallel import ParallelContext + +from examples.llama.convert_hf_to_nanotron import convert_checkpoint_and_save as convert_hf_to_nt_and_save +<<<<<<< HEAD +from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save +from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt +from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config +from examples.llama.convert_weights import load_nanotron_model +from tests.helpers.context import TestContext +from tests.helpers.utils import init_distributed +======= +from examples.llama.convert_hf_to_nanotron import convert_hf_to_nt +from examples.llama.convert_nanotron_to_hf import convert_checkpoint_and_save as convert_nt_to_hf_and_save +from examples.llama.convert_nanotron_to_hf import convert_nt_to_hf, get_hf_config +from examples.llama.convert_weights import load_nanotron_model, make_parallel_config +from tests.helpers.context import TestContext +from tests.helpers.utils import init_distributed, rerun_if_address_is_in_use +>>>>>>> main + +CONFIG = NanotronLlamaConfig( + **{ + "bos_token_id": 1, + "eos_token_id": 2, + "hidden_act": "silu", + "hidden_size": 512, + "initializer_range": 0.02, + "intermediate_size": 1024, + "is_llama_config": True, + "max_position_embeddings": 128, + "num_attention_heads": 8, + "num_hidden_layers": 4, + "num_key_value_heads": 4, + "pad_token_id": None, + "pretraining_tp": 1, + "rms_norm_eps": 1e-06, + "rope_scaling": None, + "tie_word_embeddings": False, + "use_cache": True, + "vocab_size": 4096, + } +) + + +BATCH_SIZE = 3 +SEQUENCE_LENGTH = 5 +ATOL = 0.02 + + +def create_nanotron_model(pp: int = 1, tp: int = 1, dp: int = 1) -> LlamaForTraining: + parallel_config = make_parallel_config(dp, pp, tp) + return load_nanotron_model(parallel_config, CONFIG, torch.device("cuda"), torch.bfloat16) + + +def create_huggingface_model() -> LlamaForCausalLM: + config_hf = get_hf_config(CONFIG) + with init_on_device_and_dtype(torch.device("cuda"), torch.bfloat16): + model_hf = LlamaForCausalLM._from_config(config_hf) + return model_hf + + +@pytest.fixture(autouse=True, scope="module") +def fix_seed(): + torch.manual_seed(0) + yield + + +@pytest.fixture +def input_ids() -> torch.Tensor: + return torch.randint(0, CONFIG.vocab_size, 
size=(BATCH_SIZE, SEQUENCE_LENGTH), device="cuda") + + +def _test_nt_to_hf(parallel_context: ParallelContext, input_ids: torch.Tensor): + model_nt = create_nanotron_model() + model_hf = create_huggingface_model() + convert_nt_to_hf(model_nt, model_hf, CONFIG) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +def test_nt_to_hf(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf)(input_ids=input_ids) + + +def _test_nt_to_hf_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext): + # Create and save nanotron model. + model_nt = create_nanotron_model() + root = test_context.get_auto_remove_tmp_dir() + nt_path = root / "nanotron" + hf_path = root / "hf" + nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path) + with open(nt_path / "model_config.json", "w+") as f: + json.dump(vars(CONFIG), f) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + del model_nt + # Perform conversion. + convert_nt_to_hf_and_save(nt_path, hf_path) + # Load huggingface and get logits. + model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda() + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +def test_nt_to_hf_with_files(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_nt_to_hf_with_files)(input_ids=input_ids, test_context=TestContext()) + + +def _test_hf_to_nt(parallel_context: ParallelContext, input_ids: torch.Tensor): + model_nt = create_nanotron_model() + model_hf = create_huggingface_model() + convert_hf_to_nt(model_hf, model_nt, CONFIG) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +def test_hf_to_nt(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt)(input_ids=input_ids) + + +def _test_hf_to_nt_with_files(parallel_context: ParallelContext, input_ids: torch.Tensor, test_context: TestContext): + # Create and save hf model. + model_hf = create_huggingface_model() + root = test_context.get_auto_remove_tmp_dir() + nt_path = root / "nanotron" + hf_path = root / "hf" + model_hf.save_pretrained(hf_path) + logits_hf = model_hf(input_ids).logits + del model_hf + # Perform conversion. + convert_hf_to_nt_and_save(hf_path, nt_path) + # Load nanotron and get logits. + input_mask = torch.ones_like(input_ids) + model_nt = load_nanotron_model(checkpoint_path=nt_path) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL) + + +def test_hf_to_nt_with_files(input_ids: torch.Tensor): + init_distributed(tp=1, dp=1, pp=1)(_test_hf_to_nt_with_files)(input_ids=input_ids, test_context=TestContext()) + + +def _test_composed_conversion(parallel_context: ParallelContext): + # Get HF statedict. 
+ model_hf = create_huggingface_model() + hf_sd = {key: val.clone() for key, val in model_hf.state_dict().items()} + # Convert once to nanotron, save its statedict. + model_nt = create_nanotron_model() + convert_hf_to_nt(model_hf, model_nt, CONFIG) + nt_sd = {key: val.clone() for key, val in model_nt.state_dict().items()} + # Convert back to HF, compare statedicts. + del model_hf + model_hf = create_huggingface_model() + convert_nt_to_hf(model_nt, model_hf, CONFIG) + hf_sd_new = model_hf.state_dict() + assert set(hf_sd_new) == set(hf_sd) + assert all(torch.all(hf_sd[key] == hf_sd_new[key]) for key in hf_sd_new) + # Convert to nanotron one more time, compare statedicts. + del model_nt + model_nt = create_nanotron_model() + convert_hf_to_nt(model_hf, model_nt, CONFIG) + nt_sd_new = model_nt.state_dict() + assert set(nt_sd_new) == set(nt_sd) + assert all(torch.all(nt_sd[key] == nt_sd_new[key]) for key in nt_sd_new) + + +def test_composed_conversion(): + init_distributed(tp=1, dp=1, pp=1)(_test_composed_conversion)() + + +<<<<<<< HEAD +def _save_parallel_nanotron(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path): + # Create and save a parallel model. + model_nt = create_nanotron_model(tp=parallel_context.tensor_parallel_size, pp=parallel_context.pipeline_parallel_size) + # print(torch.distributed.get_rank(), "model_nt", set(p.device for p in model_nt.parameters())) + nanotron.serialize.save_weights(model=model_nt, parallel_context=parallel_context, root_folder=nt_path) + with open(nt_path/"model_config.json", "w+") as f: + json.dump(vars(CONFIG), f) + + # Get parallel predictions. + input_ids = input_ids.cuda() # Move them to the current device index. + input_mask = torch.ones_like(input_ids) + # print(torch.distributed.get_rank(), "input_ids", input_ids.device) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + if torch.distributed.get_rank() == 0: + torch.save(logits_nt.detach().cpu(), nt_path/"logits.pt") + # print(torch.distributed.get_rank(), logits_nt.shape) + + # Convert nanotron to hf, load it and compare logits. + # hf_path = root/"hf" + # convert_nt_to_hf_and_save(nt_path, hf_path) + # model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda() + # logits_hf = model_hf(input_ids).logits + + # assert logits_nt.size() == logits_hf.size() + # assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +def _convert_from_parallel(parallel_context: ParallelContext, input_ids: torch.Tensor, nt_path: Path, hf_path: Path): + # Convert parallel nanotron to hf, get and save huggingface predictions. + convert_nt_to_hf_and_save(nt_path, hf_path) + model_hf = LlamaForCausalLM.from_pretrained(hf_path).cuda() + logits_hf = model_hf(input_ids).logits + torch.save(logits_hf.detach().cpu(), hf_path/"logits.pt") + +def test_tensor_parallel_conversion(input_ids: torch.Tensor): + # Set up test. + test_context = TestContext() + root = test_context.get_auto_remove_tmp_dir() + nt_path =root/"nanotron" + hf_path =root/"nanotron" + + # Launch both parts. + init_distributed(tp=2, dp=1, pp=1)(_save_parallel_nanotron)(input_ids=input_ids, nt_path=nt_path) + assert (nt_path/"logits.pt").exists() + init_distributed(tp=1, dp=1, pp=1)(_convert_from_parallel)(input_ids=input_ids, nt_path=nt_path, hf_path=hf_path) + assert (hf_path/"logits.pt").exists() + + # Load logits and verify they match. 
+ logits_nt = torch.load(nt_path/"logits.pt") + logits_hf = torch.load(hf_path/"logits.pt") + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) +======= +def _test_tensor_parallel_conversion(parallel_context: ParallelContext): + model_nt = create_nanotron_model(tp=2) + model_hf = create_huggingface_model() + convert_nt_to_hf(model_nt, model_hf, CONFIG) + input_mask = torch.ones_like(input_ids) + logits_nt = model_nt.model(input_ids, input_mask).permute(1, 0, 2) + logits_hf = model_hf(input_ids).logits + assert logits_nt.size() == logits_hf.size() + assert torch.allclose(logits_nt, logits_hf, atol=ATOL), torch.mean(torch.abs(logits_nt - logits_hf)) + + +@rerun_if_address_is_in_use() +def test_tensor_parallel_conversion(): + init_distributed(tp=2, dp=1, pp=1)(_test_tensor_parallel_conversion)() +>>>>>>> main diff --git a/examples/llama/tests/utils.py b/examples/llama/tests/utils.py new file mode 100644 index 00000000..6ac3c465 --- /dev/null +++ b/examples/llama/tests/utils.py @@ -0,0 +1,15 @@ +import importlib +import sys +from pathlib import Path + + +def set_system_path(): + package = importlib.import_module("nanotron") + # NOTE: Path(package.__file__).parent = .../nanotron/src/nanotron + # we want .../nanotron + package_path = Path(package.__file__).parent.parent.parent + sys.path.insert(0, str(package_path)) + + # we also want ../llama + llama_path = Path(__file__).parent.parent + sys.path.insert(0, str(llama_path)) diff --git a/examples/mamba/README.md b/examples/mamba/README.md index 5c31d07f..8eefa9c2 100644 --- a/examples/mamba/README.md +++ b/examples/mamba/README.md @@ -18,6 +18,18 @@ pip install -r requirements.txt > https://wandb.ai/bouteille/test/reports/Mamba-loss--Vmlldzo2OTgwNDM5 +## Bug related to nanotron +Encountered the following issue when ran train_mamba.sh: +``` +causal_conv1d_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZNK3c1017SymbolicShapeMeta18init_is_contiguousEv +``` +Solved this by doing: +pip uninstall mamba-ssm +pip install causal_conv1d==1.1.1 +pip install mamba-ssm --no-cache-dir +https://github.com/state-spaces/mamba/issues/169 + + ## Credits Credits to the following repositories from which the code was adapted: - https://github.com/state-spaces/mamba diff --git a/examples/mamba/convert_hf_to_nanotron.py b/examples/mamba/convert_hf_to_nanotron.py new file mode 100644 index 00000000..5109e970 --- /dev/null +++ b/examples/mamba/convert_hf_to_nanotron.py @@ -0,0 +1,286 @@ +# ruff: noqa: E402 +""" +Converts a HF model to a Nanotron model + +Command: + torchrun --nproc_per_node=1 convert_hf_to_nanotron.py --inp_path state-spaces/mamba-130m-hf --out_path nanotron_weights +""" +import argparse +import json +from dataclasses import asdict +from pathlib import Path +from typing import Dict + +import torch +import yaml +from config import MambaConfig, MambaInit, MambaModelConfig +from mamba import MambaForTraining +from nanotron import logging +from nanotron.config import ( + AllForwardAllBackwardPipelineEngine, + GeneralArgs, + LoggingArgs, + ModelArgs, + ParallelismArgs, + TensorParallelLinearMode, + TokenizerArgs, +) +from nanotron.distributed import dist +from nanotron.logging import log_rank, set_ranks_logging_level +from nanotron.models import build_model +from nanotron.parallel import ParallelContext +from nanotron.parallel.parameters import NanotronParameter, sanity_check +from nanotron.serialize import save_meta, save_weights +from nanotron.trainer 
import mark_tied_parameters +from tqdm import tqdm +from transformers import MambaConfig as HFMambaConfig +from transformers import MambaForCausalLM +from transformers.utils import CONFIG_NAME +from transformers.utils.hub import cached_file + +logger = logging.get_logger(__name__) + + +def load_config_hf(model_name): + resolved_archive_file = cached_file(model_name, CONFIG_NAME, _raise_exceptions_for_missing_entries=False) + return json.load(open(resolved_archive_file)) + + +def get_weight_from_hf( + name: str, + ref_module_state_dict: Dict[str, torch.Tensor], + ref_module: MambaForCausalLM, + nanotron_to_hf: Dict[str, str], + get_grad: bool = False, + param_is_tp_sharded: bool = False, +) -> torch.Tensor: + """From our brrr implementation, we get the equivalent tensor in transformers implementation""" + + def _interleave_pattern(N): + """ + interleave_pattern(4) -> [0, 2, 1, 3] + interleave_pattern(8) -> [0, 4, 1, 5, 2, 6, 3, 7] + """ + assert N % 2 == 0, "N must be even" + pattern = [] + for i in range(N // 2): + pattern.append(i) + pattern.append(i + N // 2) + return pattern + + hf_name = nanotron_to_hf[name] + + if get_grad is False: + + def _get_tensor(path: str): + return ref_module_state_dict[path] + + else: + + def _get_tensor(path: str): + param = ref_module.get_parameter(path) + return param.grad + + param = _get_tensor(hf_name) + + if "in_proj" in hf_name: + # In Nanotron, we do tensor parallel column so weight need to be split in the column dimension (i.e: xz.view(...)) + # However, the HF weights was trained such that it expected xz.chunk(...) to split the tensor in the row dimension + # Thus, we need to interleaved the HF weights to make it compatible with Nanotron + log_rank( + f"Interleaving {hf_name} to make it compatible with Nanotron", logger=logger, level=logging.INFO, rank=0 + ) + return param[_interleave_pattern(param.shape[0]), :] + + return param + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert HF weights from states-space repo to brrr weights") + parser.add_argument("--inp_path", type=str, default="state-spaces/mamba-130m-hf") + parser.add_argument("--out_path", type=str, default="nanotron_weight") + parser.add_argument("--dp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) + parser.add_argument("--tp", type=int, default=1) + args = parser.parse_args() + + out_path = Path(args.out_path) + + parallel_config = ParallelismArgs( + dp=args.dp, + pp=args.pp, + tp=args.tp, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + assert ( + parallel_config.tp_mode == TensorParallelLinearMode.ALL_REDUCE + and parallel_config.tp_linear_async_communication is False + ) + + parallel_context = ParallelContext( + data_parallel_size=parallel_config.dp, + pipeline_parallel_size=parallel_config.pp, + tensor_parallel_size=parallel_config.tp, + ) + + # Set log log levels + logging_config = LoggingArgs( + log_level="info", + log_level_replica="info", + ) + + # Set log levels + set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config) + + hf_config = HFMambaConfig.from_pretrained(args.inp_path) + + dtype_str = "float32" + + # TODO(fmom): Add support for ssm_cfg + yaml_content = f""" + is_mamba_config: true + d_model: {hf_config.hidden_size} + dtype: {dtype_str} + fused_add_norm: true + is_mamba_config: true + num_hidden_layers: {hf_config.num_hidden_layers} + pad_token_id: null + pad_vocab_size_multiple: 8 
+ residual_in_fp32: true + rms_norm: true + rms_norm_eps: 1.0e-05 + ssm_cfg: null + vocab_size: {hf_config.vocab_size} + """ + + dtype = getattr(torch, dtype_str) + device = torch.device("cuda") + + attrs = yaml.safe_load(yaml_content) + model_config = MambaModelConfig(**attrs) + + model_ref = MambaForCausalLM.from_pretrained(args.inp_path) + model_ref.to(device, dtype=dtype) + model_ref.eval() + + nanotron_model = build_model( + model_builder=lambda: MambaForTraining( + config=model_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=dtype, + device=device, + ) + + device_map = {} + current_pp_rank = dist.get_rank(parallel_context.pp_pg) + + tied_embs_ranks = [nanotron_model.model.token_position_embeddings.rank, nanotron_model.model.lm_head.rank] + + device_map["backbone.embedding"] = ( + nanotron_model.model.token_position_embeddings.rank if current_pp_rank in tied_embs_ranks else "meta" + ) + + for i in range(model_config.num_hidden_layers): + device_map[f"backbone.layers[{i}]"] = ( + nanotron_model.model.decoder[i].rank if current_pp_rank == nanotron_model.model.decoder[i].rank else "meta" + ) + + device_map["lm_head"] = nanotron_model.model.lm_head.rank if current_pp_rank in tied_embs_ranks else "meta" + + # Get mapping of Nanotron layer to HF layer + nanotron_to_hf = {} + + # Static mappings + nanotron_to_hf["token_position_embeddings.pp_block.token_embedding.weight"] = "backbone.embeddings.weight" + nanotron_to_hf["final_layer_norm.pp_block.weight"] = "backbone.norm_f.weight" + nanotron_to_hf["lm_head.pp_block.weight"] = "lm_head.weight" + + # Dynamic mappings within a loop + for i in range(model_config.num_hidden_layers): + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.A_log"] = f"backbone.layers.{i}.mixer.A_log" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.D"] = f"backbone.layers.{i}.mixer.D" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.in_proj.weight"] = f"backbone.layers.{i}.mixer.in_proj.weight" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.conv1d.weight"] = f"backbone.layers.{i}.mixer.conv1d.weight" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.conv1d.bias"] = f"backbone.layers.{i}.mixer.conv1d.bias" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.x_proj.weight"] = f"backbone.layers.{i}.mixer.x_proj.weight" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.x_proj.bias"] = f"backbone.layers.{i}.mixer.x_proj.bias" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.dt_proj.weight"] = f"backbone.layers.{i}.mixer.dt_proj.weight" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.dt_proj.bias"] = f"backbone.layers.{i}.mixer.dt_proj.bias" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.out_proj.weight"] = f"backbone.layers.{i}.mixer.out_proj.weight" + nanotron_to_hf[f"decoder.{i}.pp_block.mixer.out_proj.bias"] = f"backbone.layers.{i}.mixer.out_proj.bias" + nanotron_to_hf[f"decoder.{i}.pp_block.norm.weight"] = f"backbone.layers.{i}.norm.weight" + + # Sync weights + ref_state_dict = model_ref.state_dict() + for name, param in tqdm( + nanotron_model.model.named_parameters(), + total=len(list(nanotron_model.model.named_parameters())), + desc="Converting", + ): + param_is_tp_sharded = ( + isinstance(param, NanotronParameter) + and param.is_sharded + and parallel_context.world_ranks_to_pg[param.get_sharded_info().global_ranks] == parallel_context.tp_pg + ) + + ref_param = get_weight_from_hf( + name=name, + ref_module_state_dict=ref_state_dict, + ref_module=model_ref, + nanotron_to_hf=nanotron_to_hf, + 
param_is_tp_sharded=param_is_tp_sharded, + ) + + if param_is_tp_sharded: + sharded_info = param.get_sharded_info() + # copy param data (not just the reference) + with torch.no_grad(): + for local_global_slices_pair in sharded_info.local_global_slices_pairs: + local_slices = local_global_slices_pair.local_slices + global_slices = local_global_slices_pair.global_slices + param[local_slices].copy_(ref_param[global_slices]) + else: + assert ( + ref_param.shape == param.shape + ), f"Parameter shape don't match for {name}\n{ref_param.shape} != {param.shape}" + # copy param data (not just the reference) + with torch.no_grad(): + param.copy_(ref_param) + ref_param = None + torch.cuda.empty_cache() + + # Marks parameters as NanotronParameters + mark_tied_parameters(model=nanotron_model, parallel_context=parallel_context) + + sanity_check(root_module=nanotron_model) + + save_weights(model=nanotron_model, parallel_context=parallel_context, root_folder=out_path) + checkpoint_metadata = { + "last_train_step": 0, + "consumed_train_samples": 0, + } + save_meta(root_folder=out_path, parallel_context=parallel_context, checkpoint_metadata=checkpoint_metadata) + + if dist.get_rank() == 0: + with open(out_path / "config.yaml", "w") as f: + config = MambaConfig( + general=GeneralArgs(project="test", run="mamba"), + parallelism=parallel_config, + model=ModelArgs( + init_method=MambaInit(), + model_config=model_config, + ), + tokenizer=TokenizerArgs(args.inp_path), + ) + log_rank("Saving config ...", logger=logger, level=logging.INFO, rank=0) + yaml.dump(config.as_dict(), f) + + with open(out_path / "model_config.json", "w") as f: + log_rank("Saving model config ...", logger=logger, level=logging.INFO, rank=0) + json.dump(asdict(model_config), f) diff --git a/examples/mamba/convert_nanotron_to_hf.py b/examples/mamba/convert_nanotron_to_hf.py new file mode 100644 index 00000000..235d4644 --- /dev/null +++ b/examples/mamba/convert_nanotron_to_hf.py @@ -0,0 +1,209 @@ +# ruff: noqa: E402 +""" +Converts a nanotron model to HF format +Command: + torchrun --nproc_per_node=1 convert_nanotron_to_hf.py --checkpoint_path=nanotron_weights --save_path=HF_weights +""" +import argparse +import json +from pathlib import Path + +import torch +import yaml +from config import MambaModelConfig +from mamba import MambaForTraining +from nanotron import logging +from nanotron.config import ( + AllForwardAllBackwardPipelineEngine, + ParallelismArgs, + TensorParallelLinearMode, +) +from nanotron.models import build_model, init_on_device_and_dtype +from nanotron.parallel import ParallelContext +from nanotron.serialize import load_weights +from nanotron.trainer import mark_tied_parameters +from transformers import AutoTokenizer, MambaConfig, MambaForCausalLM + +logger = logging.get_logger(__name__) + + +def convert_checkpoint_and_save(checkpoint_path: Path, save_path: Path): + device = torch.device("cuda") + + with open(checkpoint_path / "config.yaml", "r") as f: + attrs = yaml.safe_load(f) + tokenizer_name = attrs["tokenizer"]["tokenizer_name_or_path"] + + with open(checkpoint_path / "model_config.json", "r") as f: + attrs = json.load(f) + model_config = MambaModelConfig(**attrs) + + dtype = getattr(torch, model_config.dtype) + + parallel_config = ParallelismArgs( + dp=1, + pp=1, + tp=1, + pp_engine=AllForwardAllBackwardPipelineEngine(), + tp_mode=TensorParallelLinearMode.ALL_REDUCE, + tp_linear_async_communication=False, + ) + + parallel_context = ParallelContext( + data_parallel_size=1, + pipeline_parallel_size=1, + 
tensor_parallel_size=1, + ) + + model_nanotron = build_model( + model_builder=lambda: MambaForTraining( + config=model_config, + parallel_context=parallel_context, + parallel_config=parallel_config, + random_states=None, + ), + parallel_context=parallel_context, + dtype=dtype, + device=device, + ) + + mark_tied_parameters(model=model_nanotron, parallel_context=parallel_context) + + # Load checkpoint directly in memory and then only keep the state dictionary + load_weights(model=model_nanotron, parallel_context=parallel_context, root_folder=checkpoint_path) + model_nanotron_state_dict = model_nanotron.state_dict() + del model_nanotron + + # Init the HF mode + if model_config.ssm_cfg is None: + model_config_hf = MambaConfig( + vocab_size=model_config.vocab_size, + num_hidden_layers=model_config.num_hidden_layers, + residual_in_fp32=model_config.residual_in_fp32, + layer_norm_epsilon=model_config.rms_norm_eps, + hidden_size=model_config.d_model, + ) + else: + model_config_hf = MambaConfig( + vocab_size=model_config.vocab_size, + num_hidden_layers=model_config.num_hidden_layers, + residual_in_fp32=model_config.residual_in_fp32, + layer_norm_epsilon=model_config.rms_norm_eps, + hidden_size=model_config.d_model, + state_size=model_config.ssm_cfg["d_state"], + expand=model_config.ssm_cfg["expand"], + conv_kernel=model_config.ssm_cfg["d_conv"], + use_bias=model_config.ssm_cfg["bias"], + use_conv_bias=model_config.ssm_cfg["conv_bias"], + time_step_rank=model_config.ssm_cfg["dt_rank"], + time_step_scale=model_config.ssm_cfg["dt_scale"], + time_step_min=model_config.ssm_cfg["dt_min"], + time_step_max=model_config.ssm_cfg["dt_max"], + time_step_init_scheme=model_config.ssm_cfg["dt_init"], + time_step_floor=model_config.ssm_cfg["dt_init_floor"], + ) + + # Initialised HF model + with init_on_device_and_dtype(device, dtype): + model_hf = MambaForCausalLM._from_config(model_config_hf) + + # Get mapping of Nanotron layer and HF layer + hf_to_nanotron = {} + + # Static mappings + hf_to_nanotron["backbone.embeddings.weight"] = "token_position_embeddings.pp_block.token_embedding.weight" + hf_to_nanotron["backbone.norm_f.weight"] = "final_layer_norm.pp_block.weight" + hf_to_nanotron["lm_head.weight"] = "lm_head.pp_block.weight" + + # Dynamic mappings within a loop + for i in range(model_config.num_hidden_layers): + hf_to_nanotron[f"backbone.layers.{i}.mixer.A_log"] = f"decoder.{i}.pp_block.mixer.A_log" + hf_to_nanotron[f"backbone.layers.{i}.mixer.D"] = f"decoder.{i}.pp_block.mixer.D" + hf_to_nanotron[f"backbone.layers.{i}.mixer.in_proj.weight"] = f"decoder.{i}.pp_block.mixer.in_proj.weight" + hf_to_nanotron[f"backbone.layers.{i}.mixer.conv1d.weight"] = f"decoder.{i}.pp_block.mixer.conv1d.weight" + hf_to_nanotron[f"backbone.layers.{i}.mixer.conv1d.bias"] = f"decoder.{i}.pp_block.mixer.conv1d.bias" + hf_to_nanotron[f"backbone.layers.{i}.mixer.x_proj.weight"] = f"decoder.{i}.pp_block.mixer.x_proj.weight" + hf_to_nanotron[f"backbone.layers.{i}.mixer.x_proj.bias"] = f"decoder.{i}.pp_block.mixer.x_proj.bias" + hf_to_nanotron[f"backbone.layers.{i}.mixer.dt_proj.weight"] = f"decoder.{i}.pp_block.mixer.dt_proj.weight" + hf_to_nanotron[f"backbone.layers.{i}.mixer.dt_proj.bias"] = f"decoder.{i}.pp_block.mixer.dt_proj.bias" + hf_to_nanotron[f"backbone.layers.{i}.mixer.out_proj.weight"] = f"decoder.{i}.pp_block.mixer.out_proj.weight" + hf_to_nanotron[f"backbone.layers.{i}.mixer.out_proj.bias"] = f"decoder.{i}.pp_block.mixer.out_proj.bias" + hf_to_nanotron[f"backbone.layers.{i}.norm.weight"] = 
f"decoder.{i}.pp_block.norm.weight" + + def _reverse_interleave_pattern(N): + """ + Compute the reverse of the interleave pattern given by _interleave_pattern. + Example: + reverse_interleave_pattern(4) -> [0, 2, 1, 3] + reverse_interleave_pattern(8) -> [0, 2, 4, 6, 1, 3, 5, 7] + """ + assert N % 2 == 0, "N must be even" + + def __interleave_pattern(N): + """ + interleave_pattern(4) -> [0, 2, 1, 3] + interleave_pattern(8) -> [0, 4, 1, 5, 2, 6, 3, 7] + """ + assert N % 2 == 0, "N must be even" + pattern = [] + for i in range(N // 2): + pattern.append(i) + pattern.append(i + N // 2) + return pattern + + interleaved_pattern = __interleave_pattern(N) + reverse_pattern = [0] * N + for original_index, interleaved_index in enumerate(interleaved_pattern): + reverse_pattern[interleaved_index] = original_index + return reverse_pattern + + # Loop over the state dict and convert the keys to HF format + for module_name_hf, module_hf in model_hf.named_modules(): + for param_name_hf, param_hf in module_hf.named_parameters(recurse=False): + # Get the Nanotron parameter + nanotron_key = "model." + hf_to_nanotron[f"{module_name_hf}.{param_name_hf}"] + param = model_nanotron_state_dict[nanotron_key] + + if "in_proj" in nanotron_key: + # Undo the interleaving weights in Nanotron to make it HF compatible + param = param[_reverse_interleave_pattern(param.shape[0]), :] + + with torch.no_grad(): + param_hf.copy_(param) + + # Save the model + model_hf.save_pretrained(save_path) + print(f"Model saved to {save_path}") + + # Save the tokenizer + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer.save_pretrained(save_path) + print(f"Tokenizer saved to {save_path}") + + +def check_converted_model_generation(save_path: Path): + HARCODED_PROMPT = "What is your " + + tokenizer = AutoTokenizer.from_pretrained(save_path) + input_ids = tokenizer(HARCODED_PROMPT, return_tensors="pt")["input_ids"] + print("Inputs:", tokenizer.batch_decode(input_ids)) + + model = MambaForCausalLM.from_pretrained(save_path) + out = model.generate(input_ids, max_new_tokens=100) + print("Generation (converted): ", tokenizer.batch_decode(out)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert Nanotron weights to HF format") + parser.add_argument("--checkpoint_path", type=str, default="mamba-130m") + parser.add_argument("--save_path", type=str, default="mamba-hf") + args = parser.parse_args() + + save_path = Path(args.save_path) + checkpoint_path = Path(args.checkpoint_path) + + # Convert Nanotron model to HF format + convert_checkpoint_and_save(checkpoint_path=checkpoint_path, save_path=save_path) + + # check if the conversion was successful by generating some text + check_converted_model_generation(save_path=save_path) diff --git a/examples/mamba/create_config_mamba.py b/examples/mamba/create_config_mamba.py index 47f214ad..eee8d161 100644 --- a/examples/mamba/create_config_mamba.py +++ b/examples/mamba/create_config_mamba.py @@ -1,9 +1,11 @@ """ Example python script to generate a YAML config file which can be used to run a training with nanotron. 
Refer to "examples" section in the `/README.md` for more information.""" import math import os +import uuid from config import MambaConfig, MambaInit, MambaModelConfig from nanotron.config import ( + AdamWOptimizerArgs, CheckpointsArgs, DataArgs, DatasetStageArgs, @@ -19,6 +21,10 @@ ) from nanotron.logging import human_format +new_job_id = uuid.uuid4() +job_id = str(new_job_id)[:8] +seed = 42 + ssm_cfg_dtype = "bfloat16" ssm_cfg = { "d_state": 16, @@ -37,7 +43,7 @@ # https://huggingface.co/state-spaces/mamba-790m/blob/main/config.json model_config = MambaModelConfig( d_model=1024, - num_hidden_layers=48, + num_hidden_layers=2, vocab_size=50278, ssm_cfg=ssm_cfg, rms_norm=True, @@ -88,15 +94,12 @@ seed = 42 + optimizer = OptimizerArgs( zero_stage=0, weight_decay=0.01, clip_grad=1.0, accumulate_grad_in_fp32=True, # NOTE(fmom): because we are using PP=TP=DP=1 - adam_eps=1e-08, - adam_beta1=0.9, - adam_beta2=0.95, - torch_adam_is_fused=True, learning_rate_scheduler=LRSchedulerArgs( learning_rate=0.0015, lr_warmup_steps=30, @@ -104,8 +107,15 @@ lr_decay_style="cosine", min_decay_lr=0.00015, ), + optimizer_factory=AdamWOptimizerArgs( + adam_eps=1e-08, + adam_beta1=0.9, + adam_beta2=0.95, + torch_adam_is_fused=True, + ), ) + parallelism = ParallelismArgs( dp=2, pp=2, @@ -128,6 +138,11 @@ ) ] +model = ModelArgs( + init_method=MambaInit(initializer_range=0.02, rescale_prenorm_residual=True, n_residuals_per_layer=1), + model_config=model_config, +) + checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints" os.makedirs(checkpoints_path, exist_ok=True) @@ -135,10 +150,7 @@ general=GeneralArgs(project="test", run="mamba", seed=seed, ignore_sanity_checks=True), checkpoints=CheckpointsArgs(checkpoints_path=checkpoints_path, checkpoint_interval=100), parallelism=parallelism, - model=ModelArgs( - init_method=MambaInit(initializer_range=0.02, rescale_prenorm_residual=True, n_residuals_per_layer=1), - model_config=model_config, - ), + model=model, tokenizer=TokenizerArgs("gpt2"), optimizer=optimizer, logging=LoggingArgs(), diff --git a/examples/mamba/mamba.py b/examples/mamba/mamba.py index fec2dcf1..88ad85d2 100644 --- a/examples/mamba/mamba.py +++ b/examples/mamba/mamba.py @@ -181,7 +181,6 @@ def __init__( self.A_log = create_sharded_parameter_from_config( parameter=A_log, pg=self.tp_pg, split_config=SplitConfig(split_dim=0) ) - self.A_log._no_weight_decay = True # D "skip" parameter self.D = create_sharded_parameter_from_config( @@ -189,7 +188,6 @@ def __init__( pg=self.tp_pg, split_config=SplitConfig(split_dim=0), ) - self.D._no_weight_decay = True # self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs) self.out_proj = TensorParallelRowLinear( @@ -228,7 +226,6 @@ def forward(self, hidden_states: Union[torch.Tensor, TensorPointer]): return out else: store["seqlen_offset"] += 1 - # We do matmul and transpose BLH -> HBL at the same time xz = self.in_proj(hidden_states).transpose(1, 2) A = -torch.exp(self.A_log.float()) # (d_inner, d_state) @@ -664,7 +661,7 @@ def get_block_compute_costs(self): def get_flops_per_sec(self, iteration_time_in_sec, sequence_length, global_batch_size): """ - Get flops per second for a Mamba model. + Get flops per second for a Mamba model. 
Terms such as nonlinearities, biases, and layer normalization are omitted (https://arxiv.org/pdf/2001.08361.pdf) """ # world_size = self.parallel_context.world_pg.size() @@ -807,6 +804,14 @@ def forward( label_mask=label_mask, )["loss"] return {"loss": loss} + + def get_named_params_without_weight_decay(self): + # get full name with "A_log", "D" + named_param_without_weight_decay = [] + for name, _ in self.model.named_parameters(): + if "A_log" in name or "D" in name: + named_param_without_weight_decay.append(name) + return named_param_without_weight_decay @torch.no_grad() def init_model_randomly(self, config): @@ -917,11 +922,7 @@ def init_model_randomly(self, config): raise ValueError(f"Who the fuck is {param_name}?") elif isinstance(module, Mamba): - # NOTE(fmom): nn.Parameter are initialized in Mamba __init__ - # In Mamba, only those 3 parameters don't have weight decay. - if param_name in ["dt_bias", "A_log", "D"]: - param._no_weight_decay = True - + pass else: raise Exception(f"Parameter {full_param_name} was not initialized") diff --git a/examples/mamba/run_generate.py b/examples/mamba/run_generate.py index f7194668..75271fa9 100644 --- a/examples/mamba/run_generate.py +++ b/examples/mamba/run_generate.py @@ -58,9 +58,9 @@ def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--ckpt-path", type=Path, required=True, help="Checkpoint path") - parser.add_argument("--dp", type=int, default=0) - parser.add_argument("--pp", type=int, default=0) - parser.add_argument("--tp", type=int, default=0) + parser.add_argument("--dp", type=int, default=1) + parser.add_argument("--pp", type=int, default=1) + parser.add_argument("--tp", type=int, default=1) parser.add_argument("--max-new-tokens", type=int, default=128, help="Maximum number of new tokens to generate") return parser.parse_args() @@ -77,9 +77,9 @@ def main(): tokenizer_path = config.tokenizer.tokenizer_name_or_path parallel_config = ParallelismArgs( - dp=args.dp or config.parallelism.dp, - pp=args.pp or config.parallelism.pp, - tp=args.tp or config.parallelism.tp, + dp=args.dp, + pp=args.pp, + tp=args.tp, pp_engine=OneForwardOneBackwardPipelineEngine(), tp_mode=TensorParallelLinearMode.ALL_REDUCE, tp_linear_async_communication=False, diff --git a/examples/moe/config_llamoe.py b/examples/moe/config_llamoe.py index ad1deec2..c1f314ea 100644 --- a/examples/moe/config_llamoe.py +++ b/examples/moe/config_llamoe.py @@ -4,6 +4,7 @@ from typing import Optional from nanotron.config import ( + AdamWOptimizerArgs, CheckpointsArgs, Config, DataArgs, @@ -99,11 +100,13 @@ def __post_init__(self): weight_decay=0.01, clip_grad=1.0, accumulate_grad_in_fp32=False, - adam_eps=1e-08, - adam_beta1=0.9, - adam_beta2=0.95, - torch_adam_is_fused=True, learning_rate_scheduler=learning_rate, + optimizer_factory=AdamWOptimizerArgs( + adam_eps=1e-08, + adam_beta1=0.9, + adam_beta2=0.95, + torch_adam_is_fused=True, + ), ) parallelism = ParallelismArgs( diff --git a/examples/moe/config_llamoe.yaml b/examples/moe/config_llamoe.yaml index 1b312129..46dc0534 100644 --- a/examples/moe/config_llamoe.yaml +++ b/examples/moe/config_llamoe.yaml @@ -5,18 +5,30 @@ checkpoints: resume_checkpoint_path: /fsx/nouamane/projects/nanotron/examples/checkpoints save_initial_state: true data_stages: - - name: General purpose training - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 12 - hf_dataset_config_name: null - hf_dataset_or_datasets: roneneldan/TinyStories - hf_dataset_splits: train - 
text_column_name: text - num_loading_workers: 1 - seed: 42 +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 12 + hf_dataset_config_name: null + hf_dataset_or_datasets: roneneldan/TinyStories + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 + name: Stable Training Stage + start_training_step: 1 +- data: + dataset: + dataset_overwrite_cache: false + dataset_processing_num_proc_per_process: 12 + hf_dataset_config_name: null + hf_dataset_or_datasets: roneneldan/TinyStories + hf_dataset_splits: train + text_column_name: text + num_loading_workers: 1 + seed: 42 + name: Annealing Phase + start_training_step: 10 general: benchmark_csv_path: null consumed_train_samples: null @@ -60,9 +72,6 @@ model: vocab_size: 32000 optimizer: accumulate_grad_in_fp32: false - adam_beta1: 0.9 - adam_beta2: 0.95 - adam_eps: 1.0e-08 clip_grad: 1.0 learning_rate_scheduler: learning_rate: 0.0003 @@ -72,7 +81,12 @@ optimizer: lr_warmup_steps: 100 lr_warmup_style: linear min_decay_lr: 1.0e-05 - torch_adam_is_fused: true + optimizer_factory: + adam_beta1: 0.9 + adam_beta2: 0.95 + adam_eps: 1.0e-08 + name: adamW + torch_adam_is_fused: true weight_decay: 0.01 zero_stage: 0 parallelism: diff --git a/examples/moe/llamoe.py b/examples/moe/llamoe.py index fb274ad4..046a4b30 100644 --- a/examples/moe/llamoe.py +++ b/examples/moe/llamoe.py @@ -14,7 +14,7 @@ # limitations under the License. """ PyTorch LLaMa MoE model.""" import math -from typing import Dict, Optional, Union +from typing import Dict, Optional, Union, List import torch from config_llamoe import LlaMoEConfig @@ -325,7 +325,6 @@ def forward( # Double check that we use store only at inference time assert key_states.requires_grad is False assert value_states.requires_grad is False - print("Using store") if "position_offsets" in store: old_position_offsets = store["position_offsets"] position_ids = old_position_offsets[:, None] + sequence_mask @@ -915,7 +914,7 @@ def init_model_randomly(self, config): else name for name, param in model.named_parameters() }, f"Somehow the initialized set of parameters don't match:\n - Expected: { {name for name, _ in model.named_parameters()} }\n - Got: {initialized_parameters}" - + def get_block_compute_costs(self): """Computes the compute cost of each block in the model so that we can do a better job of load balancing.""" return self.model.get_block_compute_costs() diff --git a/examples/moe/moe.py b/examples/moe/moe.py index 658fab01..ff9fd73b 100644 --- a/examples/moe/moe.py +++ b/examples/moe/moe.py @@ -402,7 +402,7 @@ def __init__(self, module, expert_parallel_size: int): self.expert_parallel_size = expert_parallel_size def forward(self, *args, **kwargs): - # self.scale_gradients() + self.scale_gradients() return self.module(*args, **kwargs) def scale_gradients(self): diff --git a/examples/moe/requirements.txt b/examples/moe/requirements.txt index 20b2778d..b32c55b4 100644 --- a/examples/moe/requirements.txt +++ b/examples/moe/requirements.txt @@ -1 +1,2 @@ +stanford-stk>=0.0.6 megablocks==0.5.1 diff --git a/examples/mup/README.md b/examples/mup/README.md index c86850ca..ed94c1fb 100644 --- a/examples/mup/README.md +++ b/examples/mup/README.md @@ -32,3 +32,8 @@ We trained a 350m model with spectral µTransfer and standard parametrization us Please check the directory [[./examples/mup/configs]](/examples/mup/configs) for the configurations we used to reproduce the experiments. 
![LLaMA](./assets/llama.png) + + +#### Thoughts + +For Spectral MuP, the experiments we used it on MLP only [link] and 300m LLaMA [link] (there are links to the experiment config in the mup readme). However, when we tested it on 1B/8B models iirc, the loss blew up for some reasons. So, we'd recommend they try μTransfer, not spectral μTransfer. diff --git a/pyproject.toml b/pyproject.toml index ebb81b8f..e65f37a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "packaging", "safetensors", "dacite", - "tqdm" + "tqdm", ] [tool.setuptools.packages.find] @@ -47,6 +47,12 @@ fast-modeling = [ "flash-attn>=2.5.0", ] +nanosets = [ + "transformers", + "datasets", + "numba", +] + [build-system] requires = [ "setuptools", diff --git a/run_generate.py b/run_generate.py index 05199977..07b96e1e 100644 --- a/run_generate.py +++ b/run_generate.py @@ -658,7 +658,7 @@ def evaluate(self, X, y): def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--ckpt-path", type=Path, required=True, help="Checkpoint path") - parser.add_argument("--dp", type=int, default=0) + parser.add_argument("--dp", type=int, default=1) parser.add_argument("--pp", type=int, default=0) parser.add_argument("--tp", type=int, default=0) parser.add_argument("--max-new-tokens", type=int, default=40, help="Maximum number of new tokens to generate") @@ -776,113 +776,12 @@ def main(): tokenizer.padding_side = "left" tokenizer.truncation_side = "left" # TODO @nouamane: do we want this? dummy_inputs = [ - # "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Daniel walked to the living room. Sandra moved to the dining room. John traveled to the kitchen. Daniel journeyed to the hallway. Where is Mary?\nAnswer:", - # "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary? Mary is in ", - # """ - # Passage: Ethan walked to the library. Olivia moved to the living room. Sophia headed to the living room. Sophia went to the balcony. James went to the office. Olivia returned to the library. Where is Olivia? Olivia is in the library. - # Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Daniel? Daniel is in - # """, - # """ - # Passage: The password is 93. Remember it. 93 is the password. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the password? The password is 93. - # Passage: The pass key is 24. Remember it. 24 is the pass key. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the pass key? The pass key is 24. - # Passage: The gatecode is 312. Remember it. 312 is the gatecode. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. 
The sky is blue. The grass is green. The sky is blue. . What is the gatecode? The gatecode is 312. - # Passage: The vault key is 4124. Remember it. 4124 is the vault key. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. . What is the vault key? The vault key is 4124. - # Passage: The encrypted message is 515. Remember it. 515 is encrypted message. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the encrypted message? The encrypted message is 515. - # Passage: The password is 76. Remember it. 76 is the password. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the password? The password is - # """ - # """ - # Passage: The password is 93. Remember it. 93 is the password. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the password? The password is 93. - # Passage: The password is 76. Remember it. 76 is the password. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the password? The password is - # """, - """ - Passage: The special magic Singapore number is 144. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. Based on the content of the passage, Question: What is the special magic Singapore number? Answer: The special magic Singapore number is 144. - - Passage: The special magic Netherland number is 931. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green.Question: What is the special magic Netherland number? Answer: The special magic Netherland number is - """ - # "Passage: The password is 94. Remember it. 94 is the password. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the password? The password is " - # "This is a math lesson. Answer the question. What is the result of 1 + 1? The result is ", - # "This is a math lesson. Answer the question. What is the result of 1 + 1? 
The result is ", - # "def fib(n)", - # "def fib(x)", - # "This film was probably inspired by Godzilla, ", - # "This film was probably inspired by Godzilla, ", - # "Paris is the capital of ", - # "Paris is the capital of ", - # END_PASSKEY_EXTACT_32K_TOKENS, - # END_PASSKEY_EXTACT_32K_TOKENS, - # PASSKEY_NINETY_PERCENT, - # "The pass key is 24. Remember it. 24 is the pass key. The grass is green. The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the pass key? The pass key is ", - # "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there. The grass is green. The sky is blue. The pass key is 24. Remember it. 24 is the pass key. What is the pass key? The pass key is ", - # "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there. The grass is green. The sky is blue. The best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The best thing to do in San Francisco is ", - # NEEDLE_TEXT_16K_CTX_AND_0_DEPTH, - # NEEDLE_TEXT_1K_CTX_AND_0_DEPTH, - # NEEDLE_TEXT_16K_CTX_AND_0_DEPTH, - # MATH_TEXT_256_TOKENS, - # "The pass key is 24. Remember it. 24 is the pass key. The grass is green. The sky is blue. What is the pass key? The pass key is ", - # "The pass key is 25. Remember it. 25 is the pass key. The grass is green. The sky is blue. What is the pass key? The pass key is ", - # "The pass key is 41. Remember it. 41 is the pass key. The grass is green. The sky is blue. What is the pass key? The pass key is ", - # "The pass key is 214. Remember it. 214 is the pass key. The grass is green. The sky is blue. What is the pass key? The pass key is ", - # "Remmeber the pass key. The pass key is 24. Remember it. 24 is the pass key. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue.The grass is green. The sky is blue. The grass is green. What is the pass key? The pass key is ", - # "Remmeber the pass key. The pass key is 214. Remember it. 214 is the pass key. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue.The grass is green. The sky is blue. The grass is green. What is the pass key? The pass key is ", - # "The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. Remmeber the pass key. The pass key is 214. Remember it. 214 is the pass key....... What is the pass key? The pass key is ", - # "What is the pass key? The pass key is 214. Remmeber the pass key. The pass key is 214. Remember it. 214 is the pass key. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue.The grass is green. The sky is blue. The grass is green. What is the pass key? The pass key is ", - # "The grass is green. " - # "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there. The secret color is red. Remember it. The secret color is red. 
The streets are alive with people. Horns honk and engines roar. The urban jungle never sleeps.\nThe mountains stand tall and proud. What is the secret color? The secret color is", - # "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about what is the pass key later on. The pass key is 54. Remember it. The pass key is 54. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the pass key? The pass key is ", - # "The best thing to do in Tokyo is climb a tree in Shinjuku Gyoen. The grass is green. The sky is blue. What is the best thing to do in Tokyo? The best thing to do in Tokyo is " - # "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about what is the best thing to do in Tokyo later on? The best thing to do in Tokyo is climb a tree in Shinjuku Gyoen. The grass is green. The sky is blue. The sun is yellow. Here we go. The grass is green. The sky is blue. The sun is yellow. Here we go. The grass is green. The sky is blue. The sun is yellow. Here we go. The grass is green. The sky is blue. The sun is yellow. Here we go. What is the best thing to do in Tokyo? The best thing to do in Tokyo is " - # "Sara's favorite snack is Hot Cheetos. The grass is green. The sky is blue. What is Sara's favorite snack? Sara's favorite snack is ", - # "What is Sara's favorite snack? Sara's favorite snack is Hot Cheetos. Sara's favorite snack is Hot Cheetos. The grass is green. The sky is blue. The grass is green. The sky is blue. What is Sara's favorite snack? Sara's favorite snack is ", - # "The topic is the study of the social sciences. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. What is the topic? She studies the topic about ", - # "Sara studies at the University of California, Berkeley. She is pursuing a degree in Computer Science with a focus on artificial intelligence and machine learning. Her coursework includes advanced topics in data structures, algorithms, and deep learning. Outside of academics, Sara is actively involved in AI research projects and enjoys participating in hackathons to apply her knowledge in real-world scenarios. Where does Sara study at? Sara studies at the " - # "There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about what is the pass key later on, Lisa works at Google. She is a software engineer focusing on cloud computing and infrastructure. Her responsibilities include developing scalable cloud solutions, optimizing server performance, and ensuring data security. Outside of work, Lisa is actively involved in open-source projects and enjoys contributing to tech communities. Based on the previous information, where does Lisa work? Lisa works at " - # PASSKEY_256_CTX_AND_0_DEPTH, - # 200 tokens - # "The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue", - # "The pass key is 24. Remember it. 24 is the pass key. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue.The grass is green. The sky is blue The grass is green. The sky is blue.The grass is green. The sky is blue The grass is green. The sky is blue.The grass is green. The sky is blue The grass is green. The sky is blue.The grass is green. 
The sky is blue The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue. The grass is green. The sky is blue.The grass is green. The sky is blue The grass is green. The sky is blue.The grass is green. The sky is blue The grass is green. The sky is blue. What is the pass key? The pass key is ", - # FINETUNE, - # "1234", - # "1234", - # "Hi there, I'm from the USA, my name is ", - # "Hi there, I'm from the USA, my name is ", - # NOTE: training data - # "InterruptEnumeration of all the interrupts. This enum is seldom used in application or library crates. It is present primarily for", - # "InterruptEnumeration of all the interrupts. This enum is seldom used in application or library crates. It is present primarily for", - # "If the parser encounters a syntax error, an error event value incl. a description and input position will be produced (but no JS error will be thrown) and the entire ", - # "If the parser encounters a syntax error, an error event value incl. a description and input position will be produced (but no JS error will be thrown) and the entire ", - # "This work introduces an efficient method to scale Transformer-based ", - # "This work introduces an efficient method to scale Transformer-based Large Language Models (LLMs) to infinitely long inputs with bounded memory and computation. A key component in our proposed approach is a new attention technique dubbed Infini-attention. The Infini-attention incorporates a compressive memory into the vanilla attention mechanism and builds in both masked local attention and long-term linear attention mechanisms in a single Transformer block. We demonstrate the effectiveness of our approach on long-context language modeling benchmarks, 1M sequence length passkey context block retrieval and 500K length book summarization tasks with 1B and 8B LLMs. Our approach introduces minimal bounded memory parameters and enables fast streaming inference for " - # "This work introduces an efficient method to scale Transformer-based " - # NOTE: last training data - # "Effectively managing student enquiries lies at the heart of successful real estate and investment education. Through leveraging diverse resources such as FAQs, online enquiry ", - # TRAINING_DATA_4K, - # TRAINING_DATA_4K, - # PASSKEY_1024_CTX_AND_0_DEPTH, - # PASSKEY_1024_CTX_AND_0_DEPTH - # PASSKEY_1024_CTX_AND_90_DEPTH, - # PASSKEY_1024_CTX_AND_90_DEPTH, - # MATH_TEXT_256_TOKENS, - # "Neuroscientists are the cartographers of the brain’s diverse domains and territories — the features and activities that define them, the roads and highways that connect ", - # "Professor Cathy Drennan introduces this series of lectures about basic chemical principles. She describes her path to becoming a chemist and reveals her first impression of the discipline of chemistry. ", - # "Professor Cathy Drennan introduces this series of lectures about basic chemical principles. She describes her path to becoming a chemist and reveals her first impression of the discipline of chemistry. ", - # "Professor Cathy Drennan introduces this series of lectures about basic chemical principles. She describes her path to becoming a chemist and reveals her first impression of the discipline of chemistry. Goals for students of this material are presented as well as some examples about how real world problems can be solved through the applications of chemical principles. Teaching assistants for the course are introduced. 
What is the professor's name? The professor's name is Cathy ", - # "Gravity is most accurately described by the general theory of relativity, proposed by Albert Einstein in", - # "The Standard Model of particle physics is the theory describing three of the four known fundamental forces in the universe and classifying all known elementary particles. It was developed in ", - # "In a direct democracy, the people have the direct authority to deliberate and decide legislation. In a representative democracy, the people choose", - # """ - # Neural networks comprise of layers/modules that perform operations on data. The torch.nn namespace provides all the building blocks you need to build your own neural network. Every module in PyTorch subclasses the nn.Module. A neural network is a module itself that consists of other modules (layers). This nested structure allows for building and managing complex architectures easily. - # In the following sections, we’ll build a neural network to classify images in the FashionMNIST dataset. - # import os - # import torch - # from torch import nn - # """, - # NEEDLE_16K_CTX_AND_0_DEPTH, - # NEEDLE_16K_CTX_AND_100_DEPTH - # "_grad" - # "Neel Nanda helped show how neural networks that had grokked modular arithmetic transformed the numbers using complicated mathematics. ", - # "Earth is the third planet from the Sun and the only astronomical object known to harbor life. This is enabled by Earth being an ocean world, the only one in the Solar System sustaining liquid surface water. Almost all of Earth's water is contained ", - # AI_TEXT_16K, - # "This past Monday, about a dozen engineers and executives at data science and AI company Databricks gathered in conference rooms connected via Zoom to learn if they had succeeded in building a top artificial intelligence language model. The team had spent months, and about $\$ 10$ million, training DBRX, a large language model similar in design to the one behind OpenAl's ChatGPT. But they wouldn't know how powerful their creation was until results came back from the final tests of its abilities." + "The future of AI is", + # "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:", + "def fib(n)", + # 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". 
Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. Do not use images.', + # "Advancements in technology will lead to", + # "Tomorrow's world is shaped by", ] outputs = decode_text( diff --git a/run_train.py b/run_train.py index b516be77..b33231f4 100644 --- a/run_train.py +++ b/run_train.py @@ -10,24 +10,24 @@ import argparse from typing import Dict, cast +import numpy as np from nanotron import logging -from nanotron.config import ( - DataArgs, - DatasetStageArgs, - PretrainDatasetsArgs, -) +from nanotron.config import DataArgs, DatasetStageArgs, NanosetDatasetsArgs, PretrainDatasetsArgs +from nanotron.data.dataloader_builder import build_nanoset_dataloader from nanotron.dataloader import ( clm_process, dummy_infinite_data_generator, get_datasets, get_train_dataloader, ) +from nanotron.helpers import ( + compute_remain_train_steps_of_a_data_stage_from_ckp, + get_consumed_train_samples_of_a_data_stage_from_ckp, +) from nanotron.logging import log_rank from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks from nanotron.trainer import DistributedTrainer -from nanotron.utils import ( - main_rank_first, -) +from nanotron.utils import main_rank_first from torch.utils.data import DataLoader try: @@ -41,8 +41,21 @@ logger = logging.get_logger(__name__) -def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): - """Returns a dataloader for training.""" +def get_dataloader_from_data_stage( + trainer: DistributedTrainer, + data: DataArgs, + consumed_train_samples: int, + num_remaining_train_steps: int, +): + """ + Returns a dataloader for a given data stage. + + data: The data configuration for the current stage. + consumed_train_samples: The number of samples consumed by the model in the this stage (each stage starts from zero). + num_remaining_train_steps: The number of remaining training steps for this stage. 
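+
+    Worked example (numbers are purely illustrative): with global_batch_size = 256, a stage that
+    starts at training step 100, and a run resumed from a checkpoint at step 250, this stage's
+    consumed_train_samples is roughly (250 - 100) * 256 = 38400, and num_remaining_train_steps is
+    however many of the stage's training steps are left after step 250.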
+ """ + assert consumed_train_samples >= 0, "consumed_train_samples should be greater than 0" + assert num_remaining_train_steps >= 0, "num_remaining_train_steps should be greater than 0" # First, we need to know which ranks to feed the dataloader to input_pp_rank, output_pp_rank = get_input_output_pp_ranks(model=trainer.model) @@ -87,6 +100,11 @@ def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): tokenizer.pad_token = tokenizer.eos_token tokenizer.padding_side = "left" + # Check that tokenizer's vocab size is smaller than the model's vocab size + assert ( + tokenizer.vocab_size <= trainer.model_config.vocab_size + ), f"Tokenizer's vocab size ({tokenizer.vocab_size}) is larger than the model's vocab size ({trainer.model_config.vocab_size})" + # We apply the Causal Language Modeling preprocessing train_dataset = clm_process( raw_dataset=raw_dataset, @@ -105,30 +123,55 @@ def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): input_pp_rank=input_pp_rank, output_pp_rank=output_pp_rank, micro_batch_size=trainer.micro_batch_size, - consumed_train_samples=trainer.consumed_train_samples, + consumed_train_samples=consumed_train_samples, dataloader_num_workers=data.num_loading_workers, seed_worker=data.seed, dataloader_drop_last=True, ) + # Check if we have enough samples for train_steps total_tokens_dataset = len(dataloader.dataset) * trainer.sequence_length num_tokens_needed_for_training = ( - (trainer.config.tokens.train_steps - trainer.start_iteration_step) - * trainer.global_batch_size - * trainer.sequence_length + num_remaining_train_steps * trainer.global_batch_size * trainer.sequence_length ) + assert num_tokens_needed_for_training <= total_tokens_dataset, ( + f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), " + f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}" + ) + + # Case 3: Nanosets + elif isinstance(data.dataset, NanosetDatasetsArgs): + # Get tokenizer cardinality + tokenizer = AutoTokenizer.from_pretrained(trainer.config.tokenizer.tokenizer_name_or_path) + token_dtype = np.int32 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else np.uint16 + del tokenizer + # Create Nanoset + from nanotron.data.nanoset import Nanoset - if num_tokens_needed_for_training <= total_tokens_dataset: - print("intentionally skipping this step for repeat 33 epochs") - print( - f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), " - f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.start_iteration_step}" - ) - - # assert num_tokens_needed_for_training <= total_tokens_dataset, ( - # f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), " - # f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.start_iteration_step}" - # ) + with main_rank_first(trainer.parallel_context.world_pg): + train_dataset = Nanoset( + dataset_paths=data.dataset.dataset_path, + dataset_weights=data.dataset.dataset_weights, + sequence_length=trainer.sequence_length, + token_dtype=token_dtype, + train_split_num_samples=trainer.config.tokens.train_steps * trainer.global_batch_size, + random_seed=data.seed, + ) + + # Prepare dataloader + train_dataloader = build_nanoset_dataloader( + train_dataset, + trainer.sequence_length, + parallel_context=trainer.parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + 
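+            # remaining kwargs: micro-batch size, resume offset within this stage
+            # (consumed_train_samples), and dataloader worker / drop_last settings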
micro_batch_size=trainer.micro_batch_size, + consumed_train_samples=consumed_train_samples, + dataloader_num_workers=data.num_loading_workers, + dataloader_drop_last=True, + ) + + return train_dataloader else: raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}") @@ -136,16 +179,41 @@ def get_dataloader_from_data_stage(trainer: DistributedTrainer, data: DataArgs): def get_dataloader(trainer: DistributedTrainer) -> Dict[str, DataLoader]: - sorted_stages = sorted(trainer.config.data_stages, key=lambda stage: stage.start_training_step) dataloaders = {} - for idx, stage in enumerate(sorted_stages): + + for stage_idx, stage in enumerate(trainer.config.data_stages): # NOTE: we only create the dataloader for the first stage, # then we lazy initialize the dataloader for the other stages stage = cast(DatasetStageArgs, stage) + consumed_train_samples = get_consumed_train_samples_of_a_data_stage_from_ckp(stage, trainer.metadata) + assert ( + consumed_train_samples is not None + ), f"Cannot find consumed_train_samples for stage {stage.start_training_step} in the checkpoint" + + num_remaining_train_steps = compute_remain_train_steps_of_a_data_stage_from_ckp( + stage, trainer.config, trainer.metadata + ) + log_rank( + f"[Training Plan] Stage {stage.name} has {num_remaining_train_steps} remaining training steps and has consumed {consumed_train_samples} samples", + logger=logger, + level=logging.INFO, + rank=0, + ) + dataloader = ( - get_dataloader_from_data_stage(trainer, stage.data) - if idx == 0 - else lambda stage=stage: get_dataloader_from_data_stage(trainer, stage.data) + get_dataloader_from_data_stage( + trainer, + stage.data, + consumed_train_samples=consumed_train_samples, + num_remaining_train_steps=num_remaining_train_steps, + ) + if stage_idx == 0 + else lambda stage=stage: get_dataloader_from_data_stage( + trainer, + stage.data, + consumed_train_samples=consumed_train_samples, + num_remaining_train_steps=num_remaining_train_steps, + ) ) dataloaders[stage.name] = dataloader return dataloaders diff --git a/scripts/fix_checkpoint_bad_naming.py b/scripts/fix_checkpoint_bad_naming.py new file mode 100644 index 00000000..1bd4e36e --- /dev/null +++ b/scripts/fix_checkpoint_bad_naming.py @@ -0,0 +1,51 @@ +"""Fixes the problem where '{type.value}_{suffix_name}.safetensors' was duplicated in checkpoint files + +For example this script will change the following: +``` +checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors +to +checkpoints/10/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors +``` + +Example Usage: + +python scripts/fix_checkpoint_bad_naming.py /fsx/nouamane/projects/nanotron/checkpoints/10 +""" + +import argparse +import os +import re +from pathlib import Path + + +def update_checkpoint(checkpoint_dir: str): + print(f"Updating checkpoint in {checkpoint_dir}") + for root, _, files in os.walk(checkpoint_dir): + for file in files: + if file.endswith(".safetensors"): + # r'(?<=model)_(model)' means match the string '_model' that is preceded by 'model' + if len(re.findall(r"(?<=model)_(model)", file)) == 0: + continue + # we remove second _model + new_file = re.sub(r"(?<=model)_(model)", "", file) + # we would have "model_weight.safetensors_pp-rank-0-of-1_tp-rank-0-of-2.safetensors" + + # let's assert we have two matches of ".safetensors" + assert len(re.findall(r".safetensors", new_file)) == 2 + # then we remove first match + new_file = 
re.sub(r".safetensors", "", new_file, count=1) + # so that we get "model_weight_pp-rank-0-of-1_tp-rank-0-of-2.safetensors" + + print(f"Renaming {file} to {new_file}") + os.rename(os.path.join(root, file), os.path.join(root, new_file)) + + +def main(): + parser = argparse.ArgumentParser(description="Update checkpoint from 1.3 to 1.4") + parser.add_argument("checkpoint_dir", type=Path, help="Path to the checkpoint directory") + args = parser.parse_args() + update_checkpoint(args.checkpoint_dir) + + +if __name__ == "__main__": + main() diff --git a/src/nanotron/config/config.py b/src/nanotron/config/config.py index b4f957de..9e47c520 100644 --- a/src/nanotron/config/config.py +++ b/src/nanotron/config/config.py @@ -91,11 +91,27 @@ def __post_init__(self): self.hf_dataset_splits = "train" +@dataclass +class NanosetDatasetsArgs: + dataset_path: Union[str, dict, List[str]] + + def __post_init__(self): + if isinstance(self.dataset_path, str): # Case 1: 1 Dataset file + self.dataset_path = [self.dataset_path] + self.dataset_weights = [1] + elif isinstance(self.dataset_path, List): # Case 2: > 1 Dataset file + self.dataset_weights = None # Set to None so we consume all the samples randomly + elif isinstance(self.dataset_path, dict): # Case 3: dict with > 1 dataset_path and weights + tmp_dataset_path = self.dataset_path.copy() + self.dataset_path = list(tmp_dataset_path.keys()) + self.dataset_weights = list(tmp_dataset_path.values()) + + @dataclass class DataArgs: """Arguments related to the data and data files processing""" - dataset: Optional[PretrainDatasetsArgs] + dataset: Union[PretrainDatasetsArgs, NanosetDatasetsArgs] seed: Optional[int] num_loading_workers: Optional[int] = 1 @@ -231,7 +247,7 @@ class LRSchedulerArgs: lr_warmup_steps: number of steps to warmup the learning rate lr_warmup_style: linear or constant - lr_decay_style: linear or cosine + lr_decay_style: linear, cosine or 1-sqrt min_decay_lr: minimum learning rate after decay lr_decay_steps: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps lr_decay_starting_step: optional number of steps to decay the learning rate otherwise will default to train_steps - lr_warmup_steps @@ -254,28 +270,37 @@ def __post_init__(self): self.lr_warmup_style = "linear" if self.lr_decay_style is None: self.lr_decay_style = "linear" - if self.lr_decay_style not in ["linear", "cosine"]: + if self.lr_decay_style not in ["linear", "cosine", "1-sqrt"]: raise ValueError( - f"lr_decay_style should be a string selected in ['linear', 'cosine'] and not {self.lr_decay_style}" + f"lr_decay_style should be a string selected in ['linear', 'cosine', '1-sqrt'] and not {self.lr_decay_style}" ) if self.min_decay_lr is None: self.min_decay_lr = self.learning_rate +@dataclass +class SGDOptimizerArgs: + name: str = "sgd" + + +@dataclass +class AdamWOptimizerArgs: + adam_eps: float + adam_beta1: float + adam_beta2: float + torch_adam_is_fused: bool + name: str = "adamW" + + @dataclass class OptimizerArgs: """Arguments related to the optimizer and learning rate""" + optimizer_factory: Union[SGDOptimizerArgs, AdamWOptimizerArgs] zero_stage: int weight_decay: float clip_grad: Optional[float] - accumulate_grad_in_fp32: bool - - adam_eps: float - adam_beta1: float - adam_beta2: float - torch_adam_is_fused: bool learning_rate_scheduler: LRSchedulerArgs @@ -344,6 +369,7 @@ def __post_init__(self): ) if self.data_stages is not None: + self.data_stages = sorted(self.data_stages, key=lambda stage: stage.start_training_step) 
names = [stage.name for stage in self.data_stages] training_steps = [stage.start_training_step for stage in self.data_stages] assert any( @@ -359,6 +385,12 @@ def __post_init__(self): f"Each stage should have unique starting training step, please change the starting training step for stage {stage.name}" ) + # NOTE: must order the stages by start_training_step from lowest to highest + assert all( + self.data_stages[i].start_training_step < self.data_stages[i + 1].start_training_step + for i in range(len(self.data_stages) - 1) + ), "The stages are not sorted by start_training_step in increasing order" + # # if lighteval, we need tokenizer to be defined # if self.checkpoints.lighteval is not None: # assert self.tokenizer.tokenizer_name_or_path is not None diff --git a/src/nanotron/config/parallelism_config.py b/src/nanotron/config/parallelism_config.py index 5912425b..321ee045 100644 --- a/src/nanotron/config/parallelism_config.py +++ b/src/nanotron/config/parallelism_config.py @@ -23,6 +23,7 @@ class ParallelismArgs: pp_engine: Pipeline engine to use between "1f1b" and "afab" tp_mode: TP mode to use between "all_reduce" and "reduce_scatter": all_reduce is normal, reduce_scatter activate sequence parallelism tp_linear_async_communication: Whether to use async communication in TP linear layers + recompute_layer: Whether to recompute each Transformer layer to save memory. """ dp: int @@ -31,6 +32,7 @@ class ParallelismArgs: pp_engine: Optional[PipelineEngine] = None tp_mode: Optional[TensorParallelLinearMode] = None tp_linear_async_communication: Optional[bool] = None + recompute_layer: bool = False expert_parallel_size: int = 1 diff --git a/src/nanotron/constants.py b/src/nanotron/constants.py index 7782043b..580bd99d 100644 --- a/src/nanotron/constants.py +++ b/src/nanotron/constants.py @@ -1,30 +1,12 @@ import platform -from typing import Optional from packaging.version import Version, parse -CHECKPOINT_VERSION = Version("1.2") +CHECKPOINT_VERSION = Version("1.4") PY_VERSION = parse(platform.python_version()) -# OPTIMIZER_CONFIG_FILE_NAME = "optimizer_config.json" -OPTIMIZER_CKP_PATH = "{}/optimizer/optimizer_config.json" +#### FOR SERIALIZATION #### -LR_SCHEDULER_CKP_PATH = "{}/lr_scheduler" -METADATA_CKP_PATH = "{}/checkpoint_metadata.json" - -NEEDLE = None - -GLOBAL_STEP: Optional[int] = None -LOG_STATE_INTERVAL = 2000 -IS_RANK_TO_MONITOR = None -CONFIG = None - -TRAINING_CONFIG = None - - -DEBUG_PATH = "./debug/nn_states_with_bs_2_and_transpose_qkv/acts/" - -MONITOR_STATE_PATH = "/fsx/phuc/projects/nanotron/debug/runs" - -BALANCE_FACTOR_STD = {} +CHECKPOINT_FILE_NAME = "checkpoint_metadata.json" +MODEL_CONFIG_FILE_NAME = "model_config.json" diff --git a/src/nanotron/data/dataloader_builder.py b/src/nanotron/data/dataloader_builder.py new file mode 100644 index 00000000..4719c476 --- /dev/null +++ b/src/nanotron/data/dataloader_builder.py @@ -0,0 +1,64 @@ +import nanotron.distributed as dist +from nanotron import logging +from nanotron.dataloader import ( + DataCollatorForCLM, + EmptyInfiniteDataset, + get_dataloader_worker_init, + get_sampler, +) +from nanotron.parallel import ParallelContext +from torch.utils.data import DataLoader + +logger = logging.get_logger(__name__) + + +def build_nanoset_dataloader( + dataset, + sequence_length: int, + parallel_context: ParallelContext, + input_pp_rank: int, + output_pp_rank: int, + micro_batch_size: int, + dataloader_num_workers: int, + consumed_train_samples: int = 0, + dataloader_drop_last: bool = True, + dataloader_pin_memory: bool = True, +) 
-> DataLoader: + + # Case of ranks not requiring data. We give them a dummy dataset, then the collator will do his job + if dist.get_rank(parallel_context.pp_pg) not in [input_pp_rank, output_pp_rank]: + dataset_length = len(dataset) + dataset = EmptyInfiniteDataset(length=dataset_length) + # No need to spawn a lot of workers, we can just use main + dataloader_num_workers = 0 + + data_collator = DataCollatorForCLM( + sequence_length=sequence_length, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + parallel_context=parallel_context, + ) + + # Compute size and rank of dataloader workers + dp_ranks_size = parallel_context.dp_pg.size() + dp_rank = parallel_context.dp_pg.rank() + + sampler = get_sampler( + train_dataset=dataset, + dl_ranks_size=dp_ranks_size, + dl_rank=dp_rank, + drop_last=dataloader_drop_last, + consumed_train_samples=consumed_train_samples, + shuffle=False, + ) + + return DataLoader( + dataset, + batch_size=micro_batch_size, + sampler=sampler, + collate_fn=data_collator, + drop_last=dataloader_drop_last, + num_workers=dataloader_num_workers, + pin_memory=dataloader_pin_memory, + worker_init_fn=get_dataloader_worker_init(dp_rank=dp_rank), + ) diff --git a/src/nanotron/data/nanoset.py b/src/nanotron/data/nanoset.py new file mode 100644 index 00000000..9d62b33d --- /dev/null +++ b/src/nanotron/data/nanoset.py @@ -0,0 +1,186 @@ +from typing import Dict, List, Tuple, Union + +import numpy as np +import torch +from nanotron import logging +from nanotron.data.utils import count_dataset_indexes, normalize +from nanotron.logging import log_rank +from numba import jit + +logger = logging.get_logger(__name__) + + +class Nanoset(torch.utils.data.Dataset): + """ + The Nanoset dataset + + Args: + dataset_paths (List[str]): List of paths to tokenized datasets + dataset_weights (List[float]): List with the weights for weighted datasets. If None, consume all samples from all datasets without weighting. Weights are normalized in __init__ + sequence_length (int): Sequence length of the built samples + token_dtype (Union[np.uint16, np.int32]): dtype of the tokens stored in the processed dataset files. np.uin16 for vocab sizes < 65535, np.int32 otherwise + train_split_num_samples (int): Number of samples the dataset needs. 
It's the training steps * global batch size + """ + + def __init__( + self, + dataset_paths: List[str], + dataset_weights: Union[List[float], None], + sequence_length: int, + token_dtype: Union[np.uint16, np.int32], + train_split_num_samples: int, + random_seed: int = 1234, + ) -> None: + + # Init + self.dataset_paths = dataset_paths + self.dataset_weights = dataset_weights + self.sequence_length = sequence_length + self.token_dtype = token_dtype + self.train_split_num_samples = train_split_num_samples + self.random_seed = random_seed + + # Build Nanoset Index + ## To build the index we need the length of each dataset + self.dataset_lengths = [] + for dataset_path in self.dataset_paths: + self.dataset_buffer_mmap = np.memmap(dataset_path, mode="r", order="C", dtype=self.token_dtype) + self.dataset_buffer = memoryview(self.dataset_buffer_mmap) + dataset_number_of_tokens = int(len(self.dataset_buffer)) + number_of_samples = int( + (dataset_number_of_tokens - 1) / sequence_length + ) # Discard last sample if length < sequence_length + self.dataset_lengths.append(number_of_samples) + ## Set dataset weights + if ( + self.dataset_weights is None + ): # Case of training with > 1 datasets without weighting them: Consume both datasets entirely on each epoch + self.dataset_weights = normalize(self.dataset_lengths) + else: + self.dataset_weights = normalize(dataset_weights) + ## Build dataset index and dataset sample index + self.dataset_index, self.dataset_sample_index = self.build_nanoset_index() + + self.print_nanoset_info() + + def __len__(self) -> int: + """ + Returns: + int: The number of samples of the Nanoset + """ + + return len(self.dataset_index) + + def __getitem__(self, idx: int) -> Dict[str, np.ndarray]: + """ + Returns sequence_length + 1 tokens from the memmap dataset + + Args: + idx (int): The index into the dataset + + Returns: + Dict[str, numpy.ndarray]: The input ids wrapped in a dictionary + """ + + dataset = self.dataset_index[idx] + dataset_sample = self.dataset_sample_index[idx] + + # Rebuild the memmap in every access to free memory + # https://stackoverflow.com/a/61472122 + self.dataset_buffer_mmap = np.memmap(self.dataset_paths[dataset], mode="r", order="C", dtype=self.token_dtype) + self.dataset_buffer = memoryview(self.dataset_buffer_mmap) + + # uint16 -> 2 bytes per token, int32 -> 4 bytes per token + offset = dataset_sample * self.sequence_length * (np.iinfo(self.token_dtype).bits / 8) + input_ids_tokens = np.frombuffer( + self.dataset_buffer, dtype=self.token_dtype, count=(self.sequence_length + 1), offset=int(offset) + ) + + # Return tokens as np.int32 as Torch can't handle uint16 + return {"input_ids": input_ids_tokens.astype(np.int32)} + + def build_nanoset_index(self) -> np.ndarray: + """ + Build dataset index and dataset sample index + """ + # Compute samples per epoch and number of epochs + samples_per_epoch = sum(self.dataset_lengths) + num_epochs = int(self.train_split_num_samples / samples_per_epoch) + 1 + # Build the dataset indexes for 1 epoch + dataset_index, dataset_sample_index = build_nanoset_index_helper( + n_samples=samples_per_epoch, weights=self.dataset_weights, dataset_sizes=self.dataset_lengths + ) + # Shuffle the indexes the same way + numpy_random_state = np.random.RandomState(self.random_seed) + numpy_random_state.shuffle(dataset_index) + numpy_random_state = np.random.RandomState(self.random_seed) + numpy_random_state.shuffle(dataset_sample_index) + # Concatenate num_epochs the shuffled indexes + dataset_index = np.concatenate([dataset_index 
for _ in range(num_epochs)]) + dataset_sample_index = np.concatenate([dataset_sample_index for _ in range(num_epochs)]) + # Just keep the necessary samples + dataset_index = dataset_index[: self.train_split_num_samples] + dataset_sample_index = dataset_sample_index[: self.train_split_num_samples] + + return dataset_index, dataset_sample_index + + def __del__(self) -> None: + """ + Clean up Nanoset + """ + + if hasattr(self, "dataset_buffer_mmap"): + self.dataset_buffer_mmap._mmap.close() + del self.dataset_buffer_mmap + + def print_nanoset_info(self): + + log_rank(f"> Total number of samples: {len(self)}", logger=logger, level=logging.INFO, rank=0) + log_rank( + f"> Total number of tokens: {len(self) * self.sequence_length}", logger=logger, level=logging.INFO, rank=0 + ) + + # Print samples from each dataset + weight + dataset_sample_count = count_dataset_indexes(self.dataset_index, len(self.dataset_paths)) + for index, sample_count in enumerate(dataset_sample_count): + log_rank( + f"> Total number of samples from the {self.dataset_paths[index].rsplit('/', 1)[-1]} dataset: {sample_count} ({round(normalize(dataset_sample_count).tolist()[index], 2)})", + logger=logger, + level=logging.INFO, + rank=0, + ) + + +@jit(nopython=True, cache=True) +def build_nanoset_index_helper( + n_samples: int, weights: np.ndarray, dataset_sizes: List[int] +) -> Tuple[np.ndarray, np.ndarray]: + """ + Given multiple datasets and a weighting array, build samples indexes + such that it follows those weights + """ + # Create empty arrays for dataset indices and dataset sample indices + dataset_index = np.empty((n_samples,), dtype="uint") + dataset_sample_index = np.empty((n_samples,), dtype="long") # Supports dataset with up to 2**64 samples + + # Initialize buffer for number of samples used for each dataset + current_samples = np.zeros((len(weights),), dtype="long") + + # Iterate over all samples + for sample_idx in range(n_samples): + + # Convert sample index to float for comparison against weights + sample_idx_float = max(sample_idx, 1.0) + + # Find the dataset with the highest error + errors = weights * sample_idx_float - current_samples + max_error_index = np.argmax(errors) + + # Assign the dataset index and update the sample index + dataset_index[sample_idx] = max_error_index + dataset_sample_index[sample_idx] = current_samples[max_error_index] % dataset_sizes[max_error_index] + + # Update the total samples for the selected dataset + current_samples[max_error_index] += 1 + + return dataset_index, dataset_sample_index diff --git a/src/nanotron/data/utils.py b/src/nanotron/data/utils.py new file mode 100644 index 00000000..09198187 --- /dev/null +++ b/src/nanotron/data/utils.py @@ -0,0 +1,28 @@ +from typing import List + +import numpy as np + + +def normalize(weights: List[float]) -> List[np.array]: + """ + Normalize elements of a list + + Args: + weights (List[float]): The weights + + Returns: + List[numpy.array]: The normalized weights + """ + w = np.array(weights, dtype=np.float64) + w_sum = np.sum(w) + w = w / w_sum + return w + + +def count_dataset_indexes(dataset_idx: np.ndarray, n_datasets: int): + counts = [] + + for dataset in range(n_datasets): + counts.append(np.count_nonzero(dataset_idx == dataset)) + + return counts diff --git a/src/nanotron/dataloader.py b/src/nanotron/dataloader.py index b451ec66..61f73557 100644 --- a/src/nanotron/dataloader.py +++ b/src/nanotron/dataloader.py @@ -85,7 +85,7 @@ def sanity_check_dataloader( # Adapted from h4/src/h4/data/loading.py def get_datasets( 
hf_dataset_or_datasets: Union[dict, str], - hf_dataset_config_name: Optional[str] = None, + hf_dataset_config_name: str, splits: Optional[Union[List[str], str]] = ["train", "test"], ) -> "DatasetDict": """ @@ -117,8 +117,6 @@ def get_datasets( for split in splits: raw_datasets[split] = load_dataset( hf_dataset_or_datasets, - # NOTE: weird shit, I can't pass config_name=config_name - # have to pass it as positional arguments!! hf_dataset_config_name, split=split, ) @@ -351,10 +349,10 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni ]: assert all(len(example) == 0 for example in examples) return { - "input_ids": TensorPointer(self.input_pp_rank), - "input_mask": TensorPointer(self.input_pp_rank), - "label_ids": TensorPointer(self.output_pp_rank), - "label_mask": TensorPointer(self.output_pp_rank), + "input_ids": TensorPointer(group_rank=self.input_pp_rank), + "input_mask": TensorPointer(group_rank=self.input_pp_rank), + "label_ids": TensorPointer(group_rank=self.output_pp_rank), + "label_mask": TensorPointer(group_rank=self.output_pp_rank), } # Make sure we load only what's necessary, ie we only load a `input_ids` column. @@ -402,15 +400,16 @@ def __call__(self, examples: List[Dict[str, List[np.ndarray]]]) -> Dict[str, Uni # Adapted from https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L763-L835 -def _get_train_sampler( +def get_sampler( dl_ranks_size: int, dl_rank: int, - train_dataset: "Dataset", - seed: int, - use_loop_to_round_batch_size: bool, + train_dataset: Union["Dataset", torch.utils.data.Dataset], consumed_train_samples: int, + seed: int = 42, + use_loop_to_round_batch_size: bool = False, micro_batch_size: Optional[int] = None, drop_last: Optional[bool] = True, + shuffle: bool = True, ) -> Optional[torch.utils.data.Sampler]: """returns sampler that restricts data loading to a subset of the dataset proper to the DP rank""" @@ -430,7 +429,7 @@ def _get_train_sampler( ) else: sampler = DistributedSampler( - train_dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last + train_dataset, num_replicas=dl_ranks_size, rank=dl_rank, seed=seed, drop_last=drop_last, shuffle=shuffle ) if consumed_train_samples > 0: @@ -495,7 +494,7 @@ def get_train_dataloader( # TODO @nouamanetazi: Remove unused columns: https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L852 # TODO @nouamanetazi: Support torch.utils.data.IterableDataset: https://github.com/huggingface/transformers/blob/47e1676255e5dd86b9541f734cd4f4bdcbb50f4a/src/transformers/trainer.py#L855-L872 - train_sampler = _get_train_sampler( + train_sampler = get_sampler( dl_rank=dp_rank, dl_ranks_size=dp_ranks_size, train_dataset=train_dataset, diff --git a/src/nanotron/generation/decode.py b/src/nanotron/generation/decode.py index f8caafcf..f388675c 100644 --- a/src/nanotron/generation/decode.py +++ b/src/nanotron/generation/decode.py @@ -268,6 +268,14 @@ def decode_text( p2p = model.p2p + # replicate input for n_samples times when using TOP_P or TOP_K samplers, in order to get diverse results + if generation_config and generation_config.n_samples: + if sampler_type != SamplerType.TOP_P and sampler_type != SamplerType.TOP_K: + raise ValueError("Only support n_samples for TOP_P and TOP_K sampler") + input_iter = [ + GenerationInput(text=input.text) for input in input_iter for _ in range(generation_config.n_samples) + ] + # That's annoying but I need this 
as soon as there's a change communication "cross" pipeline_state = PipelineEvalBatchState() with attach_pipeline_state_to_model(model=model, pipeline_state=pipeline_state): diff --git a/src/nanotron/helpers.py b/src/nanotron/helpers.py index 60867c51..0397622a 100644 --- a/src/nanotron/helpers.py +++ b/src/nanotron/helpers.py @@ -13,13 +13,12 @@ import torch from torch import nn from torch.nn.parallel import DistributedDataParallel -from torch.optim import AdamW from torch.optim.lr_scheduler import LambdaLR from torch.profiler import ProfilerActivity, profile, tensorboard_trace_handler from nanotron import distributed as dist from nanotron import logging -from nanotron.config import Config, LRSchedulerArgs, OptimizerArgs, ParallelismArgs +from nanotron.config import Config, DatasetStageArgs, LRSchedulerArgs, OptimizerArgs, ParallelismArgs from nanotron.distributed import ProcessGroup from nanotron.logging import LogItem, log_rank from nanotron.models.base import NanotronModel @@ -43,6 +42,7 @@ get_synced_random_state, ) from nanotron.scaling.parametrization import LearningRateForSP, LearningRateForSpectralMup, ParametrizationMethod +from nanotron.serialize.metadata import TrainingMetadata logger = logging.get_logger(__name__) @@ -152,6 +152,10 @@ def _get_lr_lambda_in_training( * (lr_decay_steps - (current_step - lr_decay_starting_step)) / lr_decay_steps ) + elif lr_scheduler_args.lr_decay_style == "1-sqrt": + lmbda = lr_scheduler_args.min_decay_lr + (initial_lr - lr_scheduler_args.min_decay_lr) * ( + 1 - math.sqrt((current_step - lr_decay_starting_step) / lr_decay_steps) + ) else: raise ValueError(f"Unknown decay style {lr_scheduler_args.lr_decay_style}") @@ -192,6 +196,41 @@ def get_lr_lambda_for_param_group(lr: float): return lr_scheduler +def get_custom_weight_decay_for_named_parameters( + named_parameters: Iterable[Tuple[str, torch.Tensor]], + model: NanotronModel, + module_id_to_prefix: Dict[int, str], + weight_decay: float, +) -> List[Dict[str, Any]]: + """ + Apply weight decay to all parameters except the ones that are in the named_param_without_weight_decay list. 
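# A quick numeric check of the "1-sqrt" decay style added above; all values here are
# illustrative, not taken from any config in this PR.
import math

def one_minus_sqrt_lr(step, decay_start, decay_steps, initial_lr, min_decay_lr):
    return min_decay_lr + (initial_lr - min_decay_lr) * (1 - math.sqrt((step - decay_start) / decay_steps))

# A quarter of the way through the decay window, the lr has dropped halfway to min_decay_lr:
assert abs(one_minus_sqrt_lr(25, 0, 100, 3e-4, 3e-5) - 1.65e-4) < 1e-12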
+ """ + + named_param_groups_with_custom_weight_decay = [] + + exclude_named_params = model.get_named_params_without_weight_decay() + + for name, param in named_parameters: + if param.is_tied: + param.get_tied_info().get_full_name_from_module_id_to_prefix(module_id_to_prefix=module_id_to_prefix) + else: + pass + + if any(name.endswith(substring) for substring in exclude_named_params): + named_param_groups_with_custom_weight_decay.append({"named_params": [(name, param)], "weight_decay": 0.0}) + else: + named_param_groups_with_custom_weight_decay.append( + {"named_params": [(name, param)], "weight_decay": weight_decay} + ) + + log_rank( + f"[Optimizer Building] Creating {len(named_param_groups_with_custom_weight_decay)} param groups with custom weight decay", + logger=logger, + level=logging.DEBUG, + ) + return named_param_groups_with_custom_weight_decay + + def get_custom_lr_for_named_parameters( parametrization_method: ParametrizationMethod, lr: float, @@ -255,6 +294,31 @@ def get_custom_lr_for_named_parameters( return named_param_groups_with_custom_lr +def merge_named_param_groups( + named_param_groups_with_lr: List[Dict[str, Any]], + named_param_groups_with_weight_decay: List[Dict[str, Any]], +) -> List[Dict[str, Any]]: + + assert len(named_param_groups_with_lr) == len( + named_param_groups_with_weight_decay + ), "Named param groups don't match in length" + + named_param_groups = [] + for group_with_lr, group_with_weight_decay in zip( + named_param_groups_with_lr, named_param_groups_with_weight_decay + ): + assert group_with_lr["named_params"] == group_with_weight_decay["named_params"] + named_param_groups.append( + { + "named_params": group_with_lr["named_params"], + "lr": group_with_lr["lr"], + "weight_decay": group_with_weight_decay["weight_decay"], + } + ) + + return named_param_groups + + def init_optimizer_and_grad_accumulator( parametrization_method: ParametrizationMethod, model: nn.Module, @@ -269,28 +333,53 @@ def init_optimizer_and_grad_accumulator( module_id_to_prefix[id(unwrapped_model)] = "" named_parameters = list(unwrapped_model.get_named_params_with_correct_tied()) - named_param_groups = get_custom_lr_for_named_parameters( + + named_param_groups_with_lr = get_custom_lr_for_named_parameters( parametrization_method=parametrization_method, named_parameters=named_parameters, model=unwrapped_model, lr=optimizer_args.learning_rate_scheduler.learning_rate, ) + named_param_groups_with_weight_decay = get_custom_weight_decay_for_named_parameters( + named_parameters=named_parameters, + model=unwrapped_model, + module_id_to_prefix=module_id_to_prefix, + weight_decay=optimizer_args.weight_decay, + ) - assert 1 == 1 + named_param_groups = merge_named_param_groups(named_param_groups_with_lr, named_param_groups_with_weight_decay) # Basic optimizer builder def basic_optimizer_builder(named_param_groups): + optimizer = None + + if optimizer_args.optimizer_factory.name == "adamW": + + def optimizer(param_groups): + return torch.optim.AdamW( + param_groups, + lr=optimizer_args.learning_rate_scheduler.learning_rate, + weight_decay=optimizer_args.weight_decay, + eps=optimizer_args.optimizer_factory.adam_eps, + betas=(optimizer_args.optimizer_factory.adam_beta1, optimizer_args.optimizer_factory.adam_beta2), + fused=optimizer_args.optimizer_factory.torch_adam_is_fused, + ) + + elif optimizer_args.optimizer_factory.name == "sgd": + + def optimizer(param_groups): + return torch.optim.SGD( + param_groups, + lr=optimizer_args.learning_rate_scheduler.learning_rate, + 
weight_decay=optimizer_args.weight_decay, + ) + + else: + raise ValueError(f"Optimizer {optimizer_args.optimizer_factory.name} is not supported") + return NamedOptimizer( named_params_or_groups=named_param_groups, - optimizer_builder=lambda param_groups: AdamW( # pylint: disable=E0601 - param_groups, - # NOTE: don't apply global weight if balance_factor_weight_decay is set - # weight_decay=optimizer_args.weight_decay if constants.CONFIG.infini_attention.balance_factor_weight_decay is None else None, - lr=optimizer_args.learning_rate_scheduler.learning_rate, - eps=optimizer_args.adam_eps, - betas=(optimizer_args.adam_beta1, optimizer_args.adam_beta2), - fused=optimizer_args.torch_adam_is_fused, - ), + optimizer_builder=optimizer, ) optimizer_builder = basic_optimizer_builder @@ -612,3 +701,39 @@ def log_throughput( if dist.get_rank(parallel_context.world_pg) == 0: write_to_csv(config.general.benchmark_csv_path, table_log, model_tflops, slurm_job_id) + + +def compute_remain_train_steps_of_a_data_stage_from_ckp( + stage: DatasetStageArgs, config: Config, metadata: TrainingMetadata +) -> int: + def is_last_stage(): + sorted_stages = sorted(config.data_stages, key=lambda x: x.start_training_step) + return sorted_stages[-1].start_training_step == stage.start_training_step + + def is_resume_from_training(): + return metadata.last_train_step > 0 + + if is_last_stage() is True: + total_train_steps = config.tokens.train_steps + else: + next_stage = next((s for s in config.data_stages if s.start_training_step > stage.start_training_step), None) + total_train_steps = next_stage.start_training_step + + if metadata.last_train_step > stage.start_training_step: + # NOTE: if the last_train_step is larger than the start_training_step of the current stage, + # it means that the training has already passed this stage + # so there is no remaining steps + return 0 + else: + last_train_steps = metadata.last_train_step if is_resume_from_training() else stage.start_training_step + return total_train_steps - last_train_steps + + +def get_consumed_train_samples_of_a_data_stage_from_ckp( + stage: DatasetStageArgs, metadata: TrainingMetadata +) -> Optional[int]: + start_training_step = stage.start_training_step + return next( + (s.consumed_train_samples for s in metadata.data_stages if s.start_training_step == start_training_step), + None, + ) diff --git a/src/nanotron/models/base.py b/src/nanotron/models/base.py index e9de2377..14ac6908 100644 --- a/src/nanotron/models/base.py +++ b/src/nanotron/models/base.py @@ -65,13 +65,16 @@ def tie_custom_params(self) -> None: """Tie custom parameters. For example for MQA marks kv heads as tied.""" pass - @staticmethod - def get_embeddings_lm_head_tied_names() -> list[str]: + def get_embeddings_lm_head_tied_names(self) -> list[str]: """Returns the names of the embeddings and lm_head weights that are tied together. Returns empty list if not tied. Example for GPT2 model: ["model.token_position_embeddings.pp_block.token_embedding.weight", "model.lm_head.pp_block.weight"] """ return [] + + def get_named_params_without_weight_decay(self) -> List[str]: + """Return a list of named parameters that should not have weight decay applied to them.""" + return [] def before_tbi_sanity_checks(self) -> None: pass diff --git a/src/nanotron/models/llama.py b/src/nanotron/models/llama.py index a00dd12c..924abeb7 100644 --- a/src/nanotron/models/llama.py +++ b/src/nanotron/models/llama.py @@ -14,10 +14,11 @@ # limitations under the License. 
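# A hedged sketch of how the merged parameter groups built above map onto torch.optim:
# per-group "lr"/"weight_decay" entries override the defaults passed to the constructor.
# The module and values are illustrative only.
import torch

fc = torch.nn.Linear(4, 4)
param_groups = [
    {"params": [fc.weight], "lr": 3e-4, "weight_decay": 0.1},
    {"params": [fc.bias], "lr": 3e-4, "weight_decay": 0.0},  # e.g. a name returned by get_named_params_without_weight_decay
]
opt = torch.optim.AdamW(param_groups, lr=1e-3, weight_decay=0.01)  # defaults, superseded per group
assert opt.param_groups[0]["weight_decay"] == 0.1 and opt.param_groups[1]["weight_decay"] == 0.0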
"""PyTorch LLaMa model.""" -from typing import Dict, Optional, Union +from typing import Dict, List, Optional, Union import torch from torch import nn +from torch.utils.checkpoint import CheckpointFunction from nanotron import constants, logging from nanotron import distributed as dist @@ -1212,7 +1213,9 @@ def __init__( self.post_attention_layernorm = TritonRMSNorm(config.hidden_size, eps=config.rms_norm_eps) self.mlp = MLP(config=config, parallel_config=parallel_config, tp_pg=tp_pg) - def forward( + self.recompute_layer = parallel_config.recompute_layer + + def _core_forward( self, hidden_states: Union[torch.Tensor, TensorPointer], sequence_mask: Union[torch.Tensor, TensorPointer], @@ -1267,6 +1270,26 @@ def forward( hidden_states = hidden_states + residual + return hidden_states, output["sequence_mask"] + + def _checkpointed_forward( + self, + hidden_states: torch.Tensor, + sequence_mask: torch.Tensor, + ) -> List[torch.Tensor]: + return CheckpointFunction.apply(self._core_forward, True, hidden_states, sequence_mask) + + def forward( + self, + hidden_states: Union[torch.Tensor, TensorPointer], + sequence_mask: Union[torch.Tensor, TensorPointer], + ) -> Dict[str, Union[torch.Tensor, TensorPointer]]: + + if self.recompute_layer and not isinstance(hidden_states, TensorPointer): + hidden_states, sequence_mask = self._checkpointed_forward(hidden_states, sequence_mask) + else: + hidden_states, sequence_mask = self._core_forward(hidden_states, sequence_mask) + return { "hidden_states": hidden_states, "sequence_mask": output["sequence_mask"], diff --git a/src/nanotron/models/starcoder2.py b/src/nanotron/models/starcoder2.py index 67f87c0f..7100351d 100644 --- a/src/nanotron/models/starcoder2.py +++ b/src/nanotron/models/starcoder2.py @@ -1555,8 +1555,7 @@ def init_model_randomly(self, config): for name, param in model.named_parameters() }, f"Somehow the initialized set of parameters don't match:\n - Expected: { {name for name, _ in model.named_parameters()} }\n - Got: {initialized_parameters}" - @staticmethod - def get_embeddings_lm_head_tied_names() -> List[str]: + def get_embeddings_lm_head_tied_names(self) -> List[str]: return [ "model.token_embeddings.pp_block.token_embedding.weight", "model.lm_head.pp_block.weight", diff --git a/src/nanotron/serialize/main.py b/src/nanotron/serialize/main.py index f8e621cd..439db5e2 100644 --- a/src/nanotron/serialize/main.py +++ b/src/nanotron/serialize/main.py @@ -1,17 +1,25 @@ from pathlib import Path -from typing import Optional +from typing import Optional, cast import torch from torch import nn from torch.nn.parallel import DistributedDataParallel +from torch.optim.lr_scheduler import LambdaLR from nanotron import distributed as dist from nanotron import logging from nanotron import optim as optim from nanotron.config import Config +from nanotron.constants import MODEL_CONFIG_FILE_NAME +from nanotron.distributed import get_global_rank from nanotron.logging import log_rank from nanotron.parallel import ParallelContext -from nanotron.serialize.metadata import CheckpointMetadata, load_meta, save_meta +from nanotron.parallel.parameters import NanotronParameter +from nanotron.sanity_checks import ( + assert_tensor_synced_across_pg, + check_optim_state_in_sync, +) +from nanotron.serialize.metadata import CheckpointMetadata, TrainingMetadata, load_meta, save_meta from nanotron.serialize.optimizer import ( load_lr_scheduler, load_optimizer, @@ -45,16 +53,15 @@ def save( optimizer: optim.BaseOptimizer, lr_scheduler: torch.optim.lr_scheduler.LRScheduler, 
parallel_context: ParallelContext, + training_metadata: TrainingMetadata, root_folder: Path, should_save_config: bool = True, should_save_model: bool = True, should_save_optimizer: bool = True, should_save_lr_scheduler: bool = True, - checkpoint_metadata: dict = None, sanity_checks: bool = True, ) -> None: - if checkpoint_metadata is None: - checkpoint_metadata = {} + assert isinstance(training_metadata, TrainingMetadata) try: if should_save_config: @@ -92,6 +99,11 @@ def save( raise e try: if should_save_lr_scheduler: + lr_scheduler = cast(LambdaLR, lr_scheduler) + assert len(lr_scheduler.lr_lambdas) == len( + optimizer.param_groups + ), "The number of lambdas functions in the scheduler should be equal to the number of parameter groups in the optimizer." + save_lr_scheduler( lr_scheduler=lr_scheduler, parallel_context=parallel_context, @@ -106,7 +118,7 @@ def save( ) raise e - save_meta(root_folder=root_folder, parallel_context=parallel_context, checkpoint_metadata=checkpoint_metadata) + save_meta(root_folder=root_folder, parallel_context=parallel_context, training_metadata=training_metadata) # TODO @thomas21: sanity check, not sure whether that needs to happen at testing or now (depends how much it costs) ### @@ -181,14 +193,13 @@ def save( # group=group, # ) - # torch.testing.assert_close( - # tensor, - # reference_tensor, - # atol=0, - # rtol=0, - # msg=lambda msg: f"tensor at {current_state_dict['names'][index]} doesn't match with our reference. Optimizer key: {name}\nCur: {tensor}\nRef: {reference_tensor}\n{msg}", - # ) - # ### + torch.testing.assert_close( + tensor, + reference_tensor, + atol=0, + rtol=0, + msg=lambda msg: f"tensor at {current_state_dict['names'][index]} doesn't match with our reference. Optimizer key: {name}\nCur: {tensor}\nRef: {reference_tensor}\n{msg}", + ) dist.barrier(parallel_context.world_pg) @@ -250,7 +261,7 @@ def parse_ckpt_path(config: Config) -> Optional[Path]: load_from_candidate = int(fi.read()) checkpoint_path = config.checkpoints.resume_checkpoint_path / str(load_from_candidate) - elif (config.checkpoints.resume_checkpoint_path / "model_config.json").exists(): + elif (config.checkpoints.resume_checkpoint_path / MODEL_CONFIG_FILE_NAME).exists(): # we assume that the checkpoint path is a path to a checkpoint checkpoint_path = config.checkpoints.resume_checkpoint_path diff --git a/src/nanotron/serialize/metadata.py b/src/nanotron/serialize/metadata.py index 0953a522..0d8708f9 100644 --- a/src/nanotron/serialize/metadata.py +++ b/src/nanotron/serialize/metadata.py @@ -1,7 +1,7 @@ import dataclasses import json from pathlib import Path -from typing import Any, Callable, ClassVar, Dict, List, Tuple, Type, Union +from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple, Type, Union import dacite import torch @@ -9,18 +9,62 @@ from packaging.version import Version from nanotron import distributed as dist -from nanotron.constants import CHECKPOINT_VERSION +from nanotron.constants import CHECKPOINT_FILE_NAME, CHECKPOINT_VERSION from nanotron.parallel import ParallelContext from nanotron.parallel.parameters import SlicesPair +@dataclasses.dataclass +class DataStageMetadata: + """ + consumed_train_samples: The number of samples consumed by the model in the this stage (each stage starts from zero). + last_train_step: The last training step across all stages. + + # NOTE: we should allow people to change the name of the data stages in the config file. 
+ # but not the start_training_step, because it could + """ + + name: str + start_training_step: int + consumed_train_samples: int + + +@dataclasses.dataclass +class TrainingMetadata: + """ + consumed_train_samples: The number of samples consumed globally, across all stages. + last_train_step: The last training step across all stages. + last_stage_idx: The index of the last stage that was trained. + data_stages: The metadata for each stage. + """ + + consumed_train_samples: int + last_train_step: int + + # TODO(xrsrke): make this not optional, once we entirely remove + # the old checkpoint version + last_stage_idx: Optional[int] = None + data_stages: Optional[List[DataStageMetadata]] = None + + def __post_init__(self): + # NOTE: this is a sanity check after loading a trained checkpoint + total_consumed_samples_across_stages = sum(stage.consumed_train_samples for stage in self.data_stages) + assert ( + self.consumed_train_samples == total_consumed_samples_across_stages + ), "Mismatch between the total consumed samples and the sum of consumed samples across stages! Something went wrong in the training." + + # TODO(xrsrke): remove this once we entirely remove non-data-stage training + if self.last_stage_idx is not None: + assert self.data_stages is not None, "data_stages should not be None if last_stage_idx is not None" + + @dataclasses.dataclass class CheckpointMetadata: version: Version tp: int dp: int - # Anything users want to store - metas: Dict + metas: TrainingMetadata + custom_metas: Optional[Dict[str, Any]] = None @dataclasses.dataclass @@ -81,7 +125,9 @@ def to_list(list_: Union[List, Tuple], type_hooks: Dict[Type, Callable[[Any], An return list_.__class__((process_type(elt, type_hooks=type_hooks) for elt in list_)) -def save_meta(parallel_context: ParallelContext, root_folder: Path, checkpoint_metadata: dict): +def save_meta(parallel_context: ParallelContext, root_folder: Path, training_metadata: TrainingMetadata): + assert isinstance(training_metadata, TrainingMetadata) + if dist.get_rank(parallel_context.world_pg) != 0: return @@ -90,18 +136,18 @@ def save_meta(parallel_context: ParallelContext, root_folder: Path, checkpoint_m version=CHECKPOINT_VERSION, tp=parallel_context.tp_pg.size(), dp=parallel_context.dp_pg.size(), - metas=checkpoint_metadata, + metas=training_metadata, ) # There are some types that require manual casting in order to work correctly. 
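# A small illustration of the consistency check in TrainingMetadata.__post_init__ above:
# the global consumed_train_samples must equal the sum of the per-stage counters.
# Stage names and numbers are made up.
from nanotron.serialize.metadata import DataStageMetadata, TrainingMetadata

metadata = TrainingMetadata(
    consumed_train_samples=3_000,
    last_train_step=30,
    last_stage_idx=1,
    data_stages=[
        DataStageMetadata(name="warmup", start_training_step=1, consumed_train_samples=1_000),
        DataStageMetadata(name="main", start_training_step=11, consumed_train_samples=2_000),
    ],
)  # passes; consumed_train_samples=2_999 would trip the AssertionError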
processed_metadata = process_type(dataclasses.asdict(checkpoint_metadata), type_hooks={Version: lambda x: str(x)}) - with open(root_folder / "checkpoint_metadata.json", mode="w") as fo: + with open(root_folder / CHECKPOINT_FILE_NAME, mode="w") as fo: json.dump(processed_metadata, fo, indent=2, sort_keys=True) def load_meta(parallel_context: ParallelContext, root_folder: Path) -> CheckpointMetadata: - with open(root_folder / "checkpoint_metadata.json", mode="r") as fi: + with open(root_folder / CHECKPOINT_FILE_NAME, mode="r") as fi: checkpoint_metadata = json.load(fi) checkpoint_metadata = from_dict( data_class=CheckpointMetadata, diff --git a/src/nanotron/serialize/utils.py b/src/nanotron/serialize/utils.py index 0e9f0416..f9b43429 100644 --- a/src/nanotron/serialize/utils.py +++ b/src/nanotron/serialize/utils.py @@ -54,13 +54,13 @@ def get_path( # NOTE: quick fix from https://huggingface.slack.com/archives/C065PTETH8S/p1713881041551769?thread_ts=1713876968.072339&cid=C065PTETH8S if exp_tp_pp_rank_and_size: # We always show pp_rank and tp_rank if `exp_tp_pp_rank_and_size` is provided - # We only show exp_rank if tensor is exp_sharded and exp_size > 1 (exp_rank, exp_size), (tp_rank, tp_size), (pp_rank, pp_size) = exp_tp_pp_rank_and_size if not is_expert_sharded or exp_size == 1: suffix_name = ( f"{type.value}_{suffix_name}_pp-rank-{pp_rank}-of-{pp_size}_tp-rank-{tp_rank}-of-{tp_size}.safetensors" ) else: + # We only show exp_rank if tensor is exp_sharded and exp_size > 1 suffix_name = f"{type.value}_{suffix_name}_pp-rank-{pp_rank}-of-{pp_size}_tp-rank-{tp_rank}-of-{tp_size}_exp-rank-{exp_rank}-of-{exp_size}.safetensors" else: suffix_name = f"{type.value}_{suffix_name}.safetensors" # <- HERE diff --git a/src/nanotron/serialize/weights.py b/src/nanotron/serialize/weights.py index 7543b2fb..7bae9ba5 100644 --- a/src/nanotron/serialize/weights.py +++ b/src/nanotron/serialize/weights.py @@ -4,7 +4,6 @@ import dacite import torch from packaging.version import Version -from safetensors import SafetensorError from safetensors.torch import safe_open, save_file from torch import nn from tqdm import tqdm @@ -97,11 +96,6 @@ def save_weights(model: nn.Module, parallel_context: ParallelContext, root_folde path.parent.mkdir(exist_ok=True, parents=True) try: tensors = {"data": param_or_buffer} - - # Mamba has some parameters that should not be weight decayed - if hasattr(model.get_parameter(name), "_no_weight_decay"): - tensors.update({"_no_weight_decay": torch.tensor(model.get_parameter(name)._no_weight_decay)}) - save_file(tensors=tensors, filename=path, metadata=metadata) except Exception as e: log_rank( @@ -260,6 +254,7 @@ def load_weights( exp_tp_pp_rank_and_size = get_exp_tp_pp_rank_and_size_from( world_rank=get_global_rank(group=group, group_rank=group_rank), parallel_context=parallel_context ) + # TODO @nouamane: do we consider exp_size=1 expert_sharded? 
is_expert_sharded = sharded_info.is_expert_sharded(parallel_context) else: exp_tp_pp_rank_and_size = None @@ -278,12 +273,6 @@ def load_weights( # TODO @thomasw21: Choose only a slice if we switch the TP topology param_or_buffer[:] = fi.get_tensor("data") - # Only Mamba params has this attribute - try: - param._no_weight_decay = fi.get_tensor("_no_weight_decay") - except SafetensorError: - pass - elif not path.parent.exists(): if strict is False: log_rank( @@ -319,7 +308,10 @@ def load_weights( ) continue else: - raise ValueError(f"Could not find any shards in {path.parent}") + raise ValueError( + f"Could not find any shards {ObjectType.MODEL.value}_{suffix}*.safetensors in {path.parent}." + f"If you notice `.safetensors` in the middle of the name of some of the checkpoints files. You need to run `scripts/fix_checkpoint_bad_naming.py`." + ) if checkpoint_version is None: checkpoint_version = get_checkpoint_version( diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 57be1da5..c8577665 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -34,10 +34,12 @@ SpectralMupInit, get_config_from_file, ) -from nanotron.constants import LR_SCHEDULER_CKP_PATH, METADATA_CKP_PATH, OPTIMIZER_CKP_PATH +from nanotron.constants import LR_SCHEDULER_CKP_PATH, METADATA_CKP_PATH, MODEL_CONFIG_FILE_NAME, OPTIMIZER_CKP_PATH from nanotron.dataloader import sanity_check_dataloader from nanotron.helpers import ( _vocab_size_with_padding, + compute_remain_train_steps_of_a_data_stage_from_ckp, + get_consumed_train_samples_of_a_data_stage_from_ckp, get_profiler, init_optimizer_and_grad_accumulator, init_random_states, @@ -89,6 +91,7 @@ save, save_random_states, ) +from nanotron.serialize.metadata import DataStageMetadata, TrainingMetadata from nanotron.serialize.optimizer import load_optimizer logger = logging.get_logger(__name__) @@ -178,10 +181,11 @@ def __init__( # else ParametrizationMethod.SPECTRAL_MUP # ) + # TODO: find a better way to handle this parametrization_method = ( - ParametrizationMethod.SPECTRAL_MUP - if isinstance(self.config.model.init_method, SpectralMupInit) - else ParametrizationMethod.STANDARD + ParametrizationMethod.STANDARD + if isinstance(self.config.model.init_method, RandomInit) + else ParametrizationMethod.SPECTRAL_MUP ) # Init optimizer @@ -256,12 +260,13 @@ def __init__( checkpoint_metadata = load_meta( parallel_context=self.parallel_context, root_folder=self.init_checkpoint_path ) + assert isinstance(checkpoint_metadata.metas, TrainingMetadata) log_rank(str(checkpoint_metadata), logger=logger, level=logging.INFO, rank=0) - self.start_iteration_step = checkpoint_metadata.metas["last_train_step"] - self.consumed_train_samples = checkpoint_metadata.metas["consumed_train_samples"] + self.metadata: TrainingMetadata = checkpoint_metadata.metas + # NOTE: we should not change data stages assert ( - self.config.tokens.train_steps > self.start_iteration_step - ), f"Loaded checkpoint has already trained {self.start_iteration_step} batches, you need to specify a higher `config.tokens.train_steps`" + self.config.tokens.train_steps > self.metadata.last_train_step + ), f"Loaded checkpoint has already trained {self.metadata.last_train_step} batches, you need to specify a higher `config.tokens.train_steps`" else: if self.init_checkpoint_path is not None and not is_ckp_meta_data_exists: log_rank( @@ -271,8 +276,15 @@ def __init__( rank=0, ) - self.start_iteration_step = 0 - self.consumed_train_samples = 0 + data_stages = [ + DataStageMetadata( + name=stage.name, 
start_training_step=stage.start_training_step, consumed_train_samples=0 + ) + for stage in self.config.data_stages + ] + self.metadata: TrainingMetadata = TrainingMetadata( + consumed_train_samples=0, last_train_step=0, last_stage_idx=0, data_stages=data_stages + ) # Setup tensorboard write and log writers on output rank self.logger_ranks = self.parallel_context.get_global_rank( @@ -289,7 +301,7 @@ def __init__( self.micro_batch_size * self.n_micro_batches_per_batch * self.parallel_context.dp_pg.size() ) self.sequence_length = self.config.tokens.sequence_length - self.iteration_step = self.start_iteration_step + self.iteration_step = self.metadata.last_train_step self.limit_val_batches = self.config.tokens.limit_val_batches # NOTE: the dataloader currently in use for the current training stage self.current_dataloader: Optional[DataLoader] = None @@ -305,8 +317,10 @@ def post_init(self): def pre_training(self, *args, **kwargs): self._print_training_plan() + metadata: TrainingMetadata = self.metadata + log_rank( - f"[Start training] datetime: {datetime.datetime.now()} | mbs: {self.micro_batch_size} | grad_accum: {self.n_micro_batches_per_batch} | global_batch_size: {self.global_batch_size} | sequence_length: {self.sequence_length} | train_steps: {self.config.tokens.train_steps} | start_iteration_step: {self.start_iteration_step} | consumed_train_samples: {self.consumed_train_samples}", # noqa + f"[Start training] datetime: {datetime.datetime.now()} | mbs: {self.micro_batch_size} | grad_accum: {self.n_micro_batches_per_batch} | global_batch_size: {self.global_batch_size} | sequence_length: {self.sequence_length} | train_steps: {self.config.tokens.train_steps} | start_iteration_step: {metadata.last_train_step} | consumed_train_samples: {metadata.consumed_train_samples}", # noqa logger=logger, level=logging.INFO, rank=0, @@ -380,23 +394,45 @@ def clear_dataloader_from_memory(dataloader: DataLoader, stage_name: str): gc.collect() dataloader = None - for stage_id, stage in enumerate(self.config.data_stages): + + def find_stage_idx_to_resume(): + reversed_data_stages = sorted(self.config.data_stages, key=lambda x: x.start_training_step, reverse=True) + for idx, stage in enumerate(reversed_data_stages): + if self.iteration_step >= stage.start_training_step: + return len(self.config.data_stages) - idx - 1 + return None + + stage_idx_to_resume = find_stage_idx_to_resume() + + for stage_idx, stage in enumerate(self.config.data_stages): + if stage_idx < self.metadata.last_stage_idx: + continue + stage = cast(DatasetStageArgs, stage) - if stage.start_training_step == self.iteration_step: + is_resume_from_training = self.current_dataloader is None and stage_idx_to_resume == stage_idx + if (stage.start_training_step == self.iteration_step) or is_resume_from_training: if self.current_dataloader is not None: - prev_stage_name = self.config.data_stages[stage_id - 1].name + prev_stage_name = self.config.data_stages[stage_idx - 1].name prev_dataloader = dataloaders[prev_stage_name] + if isinstance(prev_dataloader, DataLoader): # NOTE: we don't need to clear dummy data generator from memory clear_dataloader_from_memory(prev_dataloader, stage_name=stage.name) - log_rank( - f"[Training Stage: {stage.name}] Switching to a new dataset", - logger=logger, - level=logging.INFO, - rank=0, - ) + self.metadata.last_stage_idx = stage_idx + + if is_resume_from_training: + remaining_train_steps = compute_remain_train_steps_of_a_data_stage_from_ckp( + stage, self.config, self.metadata + ) + consumed_train_steps = 
get_consumed_train_samples_of_a_data_stage_from_ckp(stage, self.metadata) + log_rank( + f"Resuming training from stage {stage.name}, it has trained for {consumed_train_steps} samples and has {remaining_train_steps} remaining train steps", + logger=logger, + level=logging.INFO, + rank=0, + ) dataloader = dataloaders[stage.name] # NOTE: if a dataloader is lazy initialized, we need to call it to initialize it @@ -465,7 +501,12 @@ def train( outputs, loss_avg = self.training_step(dataloader=self.current_dataloader) # Training Logs - self.consumed_train_samples += self.global_batch_size + # TODO(xrsrke): refactor using callbacks would be better + self.metadata.consumed_train_samples += self.global_batch_size + self.metadata.last_train_step = self.iteration_step + self.metadata.data_stages[ + self.metadata.last_stage_idx + ].consumed_train_samples += self.global_batch_size if (self.iteration_step - 1) % self.config.logging.iteration_step_info_interval == 0: self.train_step_logs(outputs=outputs, loss_avg=loss_avg) @@ -621,7 +662,9 @@ def train_step_logs( log_entries = [ # LogItem("consumed_samples", self.consumed_train_samples, "human_format"), # , "12d"), LogItem( - "consumed_tokens", self.consumed_train_samples * self.config.tokens.sequence_length, "human_format" + "consumed_tokens", + self.metadata.consumed_train_samples * self.config.tokens.sequence_length, + "human_format", ), # , "12d"), LogItem("elapsed_time_per_iteration_ms", elapsed_time_per_iteration_ms, "human_format"), # , ".1f"), LogItem("tokens_per_sec", tokens_per_sec, "human_format"), # , "1.6E"), @@ -708,19 +751,12 @@ def init_model(self) -> Union[NanotronModel, DistributedDataParallel]: ) else: log_rank( - f"max_position_embeddings is {self.model_config.max_position_embeddings} and sequence_length is {self.config.tokens.sequence_length}. But i don't set it here", + f"Setting max_position_embeddings to {self.config.tokens.sequence_length}. Previous value was {self.model_config.max_position_embeddings}.", logger=logger, level=logging.INFO, rank=0, ) - # log_rank( - # f"Setting max_position_embeddings to {self.config.tokens.sequence_length}. 
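# A hedged sketch of the stage-resume lookup (find_stage_idx_to_resume) above: take the
# latest configured stage whose start_training_step is not after the current iteration step.
# The step numbers below are made up.
starts = [1, 1000, 5000]   # start_training_step of each configured data stage
iteration_step = 3200
stage_idx_to_resume = max(i for i, s in enumerate(starts) if iteration_step >= s)  # -> 1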
Previous value was {self.model_config.max_position_embeddings}.", - # logger=logger, - # level=logging.INFO, - # rank=0, - # ) - # self.model_config.max_position_embeddings = self.config.tokens.sequence_length - pass + self.model_config.max_position_embeddings = self.config.tokens.sequence_length log_rank("Config:\n" + pformat(self.config), logger=logger, level=logging.INFO, rank=0) log_rank("Model Config:\n" + pformat(self.model_config), logger=logger, level=logging.INFO, rank=0) @@ -909,15 +945,10 @@ def save_checkpoint(self) -> Path: dist.barrier(self.parallel_context.world_pg) log_rank(f"Saving checkpoint at {checkpoint_path}", logger=logger, level=logging.WARNING, rank=0) - checkpoint_metadata = { - "last_train_step": self.iteration_step, - # TODO: @nouamanetazi: Add more metadata to the checkpoint to be able to resume dataloader states properly - "consumed_train_samples": self.consumed_train_samples, - } # Update step/samples numbers before we save the config - self.config.general.step = self.iteration_step - self.config.general.consumed_train_samples = self.consumed_train_samples + self.config.general.step = self.metadata.last_train_step + self.config.general.consumed_train_samples = self.metadata.consumed_train_samples save( model=self.unwrapped_model, @@ -935,7 +966,7 @@ def save_checkpoint(self) -> Path: ), # We only save the config on world_rank==0 parallel_context=self.parallel_context, root_folder=checkpoint_path, - checkpoint_metadata=checkpoint_metadata, + training_metadata=self.metadata, config=self.config, ) save_random_states( @@ -945,9 +976,9 @@ def save_checkpoint(self) -> Path: fo.write(f"{self.iteration_step}") if hasattr(self.model_config, "to_json_file"): - self.model_config.to_json_file(checkpoint_path / "model_config.json") + self.model_config.to_json_file(checkpoint_path / MODEL_CONFIG_FILE_NAME) else: - with open(checkpoint_path / "model_config.json", mode="w") as fo: + with open(checkpoint_path / MODEL_CONFIG_FILE_NAME, mode="w") as fo: fo.write(json.dumps(asdict(self.model_config))) self.post_save_checkpoint() diff --git a/tests/helpers/data.py b/tests/helpers/data.py new file mode 100644 index 00000000..33bb2480 --- /dev/null +++ b/tests/helpers/data.py @@ -0,0 +1,171 @@ +import hashlib +import importlib +import json +import os +import sys +from collections import OrderedDict +from pathlib import Path + +package = importlib.import_module("nanotron") +package_path = Path(package.__file__).parent.parent.parent +sys.path.append(str(package_path)) + +from argparse import Namespace + +import nanotron.distributed as dist +import torch +from nanotron.data.nanoset import Nanoset +from nanotron.parallel import ParallelContext +from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from nanotron.sanity_checks import assert_tensor_synced_across_pg + +from tools.preprocess_data import main + + +def create_dataset_paths(tmp_dir: str, quantity: int): + json_dataset_path = [os.path.join(tmp_dir, f"pytest_{i}") for i in range(quantity)] + mmap_dataset_path = [f"{path}_input_ids.npy" for path in json_dataset_path] + + return json_dataset_path, mmap_dataset_path + + +def create_dummy_json_dataset(path_to_json: str, dummy_text: str, n_samples: int = 50000): + + with open(path_to_json + ".json", "a") as json_file: + for sample in range(n_samples): + sample_dict = {"text": f"[{sample}] Hello! Im sample {sample}! 
And this is my dummy text: {dummy_text}"} + json_file.write(json.dumps(sample_dict)) + json_file.write("\n") + + +def preprocess_dummy_dataset(path_to_json: str, tokenizer: str): + # Create args for preprocessing + args = Namespace( + input=path_to_json + ".json", + column="text", + output_prefix=path_to_json, + tokenizer_name_or_path=tokenizer, + add_special_tokens=False, + ) + + # tools/preprocess_data.py main + main(args) + + +def assert_batch_dataloader( + batch: dict, parallel_context: ParallelContext, micro_batch_size: int, sequence_length: int +): + """ + batch (dict): Batch produced from the Dataloader, with keys input_ids, input_mask, label_ids, label_mask + + """ + for element in batch: + tensor = batch[element] + + # Assert that inputs are only present in input_pp_rank and outputs in output_pp_rank + input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) + if dist.get_rank(parallel_context.pp_pg) == input_pp_rank and element.startswith("input_"): + assert isinstance(tensor, torch.Tensor) + elif dist.get_rank(parallel_context.pp_pg) == output_pp_rank and element.startswith("label_"): + assert isinstance(tensor, torch.Tensor) + else: + assert isinstance(tensor, TensorPointer) + + data_class = ( + 0 # 0 if tensor is from the ids, 1 if TensorPointer and 2 if mask. Used in the data parallel group check + ) + + # Check shape of mask and ids tensors + if isinstance(tensor, torch.Tensor): + assert tensor.shape == (micro_batch_size, sequence_length) + + # TensorPointer case: Check that all TensorPointers from the same tp_pg point to the same group_rank. Create torch.tensor with group_rank + if isinstance(tensor, TensorPointer): + tensor = torch.tensor(tensor.group_rank) + data_class = 1 + + # Attention Masks case: dtype is torch.bool --> Transform to int64 + if tensor.dtype == torch.bool: + tensor = tensor.long() + data_class = 2 + + # Assert that we have the SAME element in all the processes belonging to the same tensor parallel group + assert_tensor_synced_across_pg( + tensor=tensor.flatten().cuda(), + pg=parallel_context.tp_pg, + msg=lambda err: f"{element} is not synchronized across TP {err}", + ) + + # Assert that we have the SAME class of data in all processes belonging to the same data parallel group + assert_tensor_synced_across_pg( + tensor=torch.tensor(data_class, device="cuda"), + pg=parallel_context.dp_pg, + msg=lambda err: f"{element} is not synchronized across DP {err}", + ) + + +def compute_hash(identifier: OrderedDict, n_digit: int = 8) -> int: + """ + Creates a sha256 hash from the elements of a OrderedDict + """ + unique_description = json.dumps(identifier, indent=4) + # Create n_digit description hash + unique_description_hash = int(hashlib.sha256(unique_description.encode("utf-8")).hexdigest(), 16) % 10**n_digit + return unique_description_hash + + +def assert_nanoset_sync_across_all_ranks(nanoset: Nanoset, parallel_context: ParallelContext): + """ + Checks that the same Nanoset is created in all processes + """ + # Extract a sample from the Nanoset + IDX_SAMPLE = 23 + + nanoset_identifiers = OrderedDict() + nanoset_identifiers["dataset_paths"] = nanoset.dataset_paths + nanoset_identifiers["dataset_weights"] = nanoset.dataset_weights.tolist() + nanoset_identifiers["sequence_length"] = nanoset.sequence_length + nanoset_identifiers["train_split_num_samples"] = nanoset.train_split_num_samples + nanoset_identifiers["random_seed"] = nanoset.random_seed + nanoset_identifiers["length"] = len(nanoset) + nanoset_identifiers["input_ids"] = 
nanoset[IDX_SAMPLE]["input_ids"].tolist() + nanoset_identifiers["dataset_index"] = nanoset.dataset_index.tolist() + nanoset_identifiers["dataset_sample_index"] = nanoset.dataset_sample_index.tolist() + + unique_description_hash = compute_hash(nanoset_identifiers) + assert_tensor_synced_across_pg( + tensor=torch.tensor(unique_description_hash, device="cuda"), + pg=parallel_context.world_pg, + msg=lambda err: f"Nanoset is not synchronized across all processes {err}", + ) + + +def compute_batch_hash(batch: dict) -> int: + """ + Checks that the Nanoset/BlendedNanoset is in the same state after recovering from a crash + + batch (dict): Batch produced from the Dataloader, with keys input_ids, input_mask, label_ids, label_mask + + """ + batch_identifiers = OrderedDict() + + for element in batch: + tensor = batch[element] + + # TensorPointer + if isinstance(tensor, TensorPointer): + identifier = tensor.group_rank + + # Attention Masks case: dtype is torch.bool --> Transform to int64 + elif tensor.dtype == torch.bool: + identifier = tensor.long().tolist() + + # Input IDs tensor + else: + identifier = tensor.tolist() + + batch_identifiers[element] = identifier + + unique_description_hash = compute_hash(batch_identifiers) + + return unique_description_hash diff --git a/tests/nanoset/test_build_nanoset_dataloader.py b/tests/nanoset/test_build_nanoset_dataloader.py new file mode 100644 index 00000000..2c3ff542 --- /dev/null +++ b/tests/nanoset/test_build_nanoset_dataloader.py @@ -0,0 +1,281 @@ +import sys +from math import isclose +from pathlib import Path + +package_path = Path(__file__).parent.parent +sys.path.append(str(package_path)) + +import numpy as np +import pytest +from helpers.context import TestContext +from helpers.data import ( + assert_batch_dataloader, + assert_nanoset_sync_across_all_ranks, + compute_batch_hash, + create_dataset_paths, + create_dummy_json_dataset, + preprocess_dummy_dataset, +) +from helpers.utils import available_gpus, get_all_3d_configurations, init_distributed, rerun_if_address_is_in_use +from nanotron.data.dataloader_builder import build_nanoset_dataloader +from nanotron.data.nanoset import Nanoset +from nanotron.data.utils import count_dataset_indexes, normalize +from nanotron.parallel import ParallelContext +from nanotron.utils import main_rank_first +from transformers import AutoTokenizer + + +@pytest.mark.parametrize( + "tp,dp,pp", + [ + pytest.param(*all_3d_configs) + for gpus in range(1, min(available_gpus(), 4) + 1) + for all_3d_configs in get_all_3d_configurations(gpus) + ], +) +@pytest.mark.parametrize("train_steps", [5, 100]) +@pytest.mark.parametrize("sequence_length", [512, 8192]) +@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"]) +@rerun_if_address_is_in_use() +def test_build_nanoset_dataloader( + tp: int, dp: int, pp: int, train_steps: int, sequence_length: int, tokenizer_name_or_path: str +): + test_context = TestContext() + + # Create dataset files + json_paths, mmap_dataset_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) + + # Create dummy json datasets + for idx, json_path in enumerate(json_paths): + create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000) + + init_distributed(tp=tp, dp=dp, pp=pp)(_test_build_nanoset_dataloader)( + json_paths=json_paths, + path_to_mmap_files=mmap_dataset_paths, + train_steps=train_steps, + sequence_length=sequence_length, + 
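# A standalone sketch of the hashing trick used by the sync checks above: an ordered
# description is reduced to a short integer so it can be compared across ranks with
# assert_tensor_synced_across_pg. Values are illustrative.
import hashlib
import json
from collections import OrderedDict

def toy_hash(identifier: OrderedDict, n_digit: int = 8) -> int:
    digest = hashlib.sha256(json.dumps(identifier, indent=4).encode("utf-8")).hexdigest()
    return int(digest, 16) % 10**n_digit

assert toy_hash(OrderedDict(sequence_length=512, length=1_000)) == toy_hash(
    OrderedDict(sequence_length=512, length=1_000)
)  # identical descriptions hash identically on every rank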
tokenizer_name_or_path=tokenizer_name_or_path, + ) + + +def _test_build_nanoset_dataloader( + parallel_context: ParallelContext, + json_paths: str, + path_to_mmap_files: str, + train_steps: int, + sequence_length: int, + tokenizer_name_or_path: str, +): + SEED = 1234 + MICRO_BATCH_SIZE = 4 + N_MICRO_BATCHES_PER_BATCH = 8 + GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size() + + # Preprocess dummy json datasets + for json_path in json_paths: + preprocess_dummy_dataset(path_to_json=json_path, tokenizer=tokenizer_name_or_path) + + input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) + + # Get tokenizer cardinality + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + token_dtype = np.int32 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else np.uint16 + del tokenizer + + # Create Nanoset configs: 1. Normal 2. Blended 3. Blended with weights + nanoset_config = { + "dataset_paths": [path_to_mmap_files[0]], + "dataset_weights": [1], + "sequence_length": sequence_length, + "token_dtype": token_dtype, + "train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE, + "random_seed": SEED, + } + + blended_nanoset_config = { + "dataset_paths": [path_to_mmap_files[0], path_to_mmap_files[1]], + "dataset_weights": None, + "sequence_length": sequence_length, + "token_dtype": token_dtype, + "train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE, + "random_seed": SEED, + } + + blended_weighted_nanoset_config = { + "dataset_paths": [path_to_mmap_files[0], path_to_mmap_files[1]], + "dataset_weights": [8, 2], + "sequence_length": sequence_length, + "token_dtype": token_dtype, + "train_split_num_samples": train_steps * GLOBAL_BATCH_SIZE, + "random_seed": SEED, + } + + configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config] + + for config in configs: + # Create Nanoset + with main_rank_first(parallel_context.world_pg): + train_dataset = Nanoset(**config) + + # Assert we have the same Nanoset in all ranks + assert_nanoset_sync_across_all_ranks(train_dataset, parallel_context) + dataset_sample_count = count_dataset_indexes(train_dataset.dataset_index, len(train_dataset.dataset_paths)) + for idx, ds_length in enumerate(train_dataset.dataset_lengths): + # Assert Nanoset doesn't sample indexes greater than the datasets + assert ( + np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index == idx, initial=-1) + < ds_length + ), f"Error building Nanoset Indexes: Tryng to access sample {np.max(train_dataset.dataset_sample_index, where=train_dataset.dataset_index==idx, initial = -1)} of a {ds_length} sample dataset" + # Assert Nanoset builds up the correct blend WRT the dataset_weights + assert isclose( + normalize(dataset_sample_count).tolist()[idx], train_dataset.dataset_weights[idx], abs_tol=0.05 + ), f"Requested Nanoset to contain {round(train_dataset.dataset_weights[idx]*100, 2)}% of samples from {train_dataset.dataset_paths[idx]} but got {round(normalize(dataset_sample_count).tolist()[idx]*100, 2)}%" + # Create Dataloaders + dataloader = build_nanoset_dataloader( + train_dataset, + sequence_length=sequence_length, + parallel_context=parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=MICRO_BATCH_SIZE, + dataloader_num_workers=0, + dataloader_drop_last=True, + ) + + # Check a batch produced by the Dataloader + batch = next(iter(dataloader)) + assert_batch_dataloader( + batch=batch, + parallel_context=parallel_context, + 
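# Note on the token_dtype selection used in these tests: np.iinfo(np.uint16).max is 65535,
# so a tokenizer with at most 65536 entries (e.g. GPT-2's 50257) is stored as uint16, while
# larger vocabularies (e.g. Llama-3's roughly 128k tokens) fall back to int32. The vocabulary
# sizes quoted here are approximate and only meant as an illustration.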
micro_batch_size=MICRO_BATCH_SIZE, + sequence_length=sequence_length, + ) + + parallel_context.destroy() + + +@pytest.mark.parametrize( + "tp,dp,pp", + [ + pytest.param(*all_3d_configs) + for gpus in range(1, min(available_gpus(), 4) + 1) + for all_3d_configs in get_all_3d_configurations(gpus) + ], +) +@pytest.mark.parametrize("skipped_batches", [20, 50]) +@pytest.mark.parametrize("tokenizer_name_or_path", ["openai-community/gpt2", "unsloth/llama-3-8b-bnb-4bit"]) +@rerun_if_address_is_in_use() +def test_recover_nanoset_dataloader(tp: int, dp: int, pp: int, skipped_batches: int, tokenizer_name_or_path: str): + test_context = TestContext() + + # Create dataset files + json_paths, mmap_dataset_paths = create_dataset_paths(tmp_dir=test_context.get_auto_remove_tmp_dir(), quantity=2) + + # Create dummy json datasets + for idx, json_path in enumerate(json_paths): + create_dummy_json_dataset(path_to_json=json_path, dummy_text=f"Nanoset {idx}!", n_samples=(idx + 1) * 50000) + + init_distributed(tp=tp, dp=dp, pp=pp)(_test_recover_nanoset_dataloader)( + json_paths=json_paths, + path_to_mmap_files=mmap_dataset_paths, + skipped_batches=skipped_batches, + tokenizer_name_or_path=tokenizer_name_or_path, + ) + + +def _test_recover_nanoset_dataloader( + parallel_context: ParallelContext, + json_paths: str, + path_to_mmap_files: str, + skipped_batches: int, + tokenizer_name_or_path: str, +): + SEED = 1234 + MICRO_BATCH_SIZE = 4 + N_MICRO_BATCHES_PER_BATCH = 8 + GLOBAL_BATCH_SIZE = MICRO_BATCH_SIZE * N_MICRO_BATCHES_PER_BATCH * parallel_context.dp_pg.size() + SEQUENCE_LENGTH = 1024 + TRAIN_STEPS = 100 + + # Preprocess dummy json datasets + for json_path in json_paths: + preprocess_dummy_dataset(path_to_json=json_path, tokenizer=tokenizer_name_or_path) + + input_pp_rank, output_pp_rank = 0, int(parallel_context.pp_pg.size() - 1) + + # Get tokenizer cardinality + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) + token_dtype = np.int32 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else np.uint16 + del tokenizer + + # Create Nanoset configs: 1. Normal 2. Blended 3. 
Blended with weights + nanoset_config = { + "dataset_paths": [path_to_mmap_files[0]], + "dataset_weights": [1], + "sequence_length": SEQUENCE_LENGTH, + "token_dtype": token_dtype, + "train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE, + "random_seed": SEED, + } + + blended_nanoset_config = { + "dataset_paths": [path_to_mmap_files[0], path_to_mmap_files[1]], + "dataset_weights": None, + "sequence_length": SEQUENCE_LENGTH, + "token_dtype": token_dtype, + "train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE, + "random_seed": SEED, + } + + blended_weighted_nanoset_config = { + "dataset_paths": [path_to_mmap_files[0], path_to_mmap_files[1]], + "dataset_weights": [8, 2], + "sequence_length": SEQUENCE_LENGTH, + "token_dtype": token_dtype, + "train_split_num_samples": TRAIN_STEPS * GLOBAL_BATCH_SIZE, + "random_seed": SEED, + } + + configs = [nanoset_config, blended_nanoset_config, blended_weighted_nanoset_config] + + for config in configs: + # Create Nanoset + with main_rank_first(parallel_context.world_pg): + train_dataset = Nanoset(**config) + + # Create initial Dataloader + dataloader = build_nanoset_dataloader( + train_dataset, + sequence_length=SEQUENCE_LENGTH, + parallel_context=parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=MICRO_BATCH_SIZE, + dataloader_num_workers=0, + dataloader_drop_last=True, + ) + + # Recover from failures + dataloader = iter(dataloader) + for _ in range(skipped_batches + 1): # In order to compare with the first batch of the recovered DataLoader + batch = next(dataloader) + + # Create recover Dataloader + recovered_dataloader = build_nanoset_dataloader( + train_dataset, + sequence_length=SEQUENCE_LENGTH, + parallel_context=parallel_context, + input_pp_rank=input_pp_rank, + output_pp_rank=output_pp_rank, + micro_batch_size=MICRO_BATCH_SIZE, + dataloader_num_workers=0, + dataloader_drop_last=True, + # NOTE The dataloader serves batches of micro_batch_size despite of batch_accumulation_per_replica + consumed_train_samples=skipped_batches * MICRO_BATCH_SIZE * parallel_context.dp_pg.size(), + ) + + recovered_first_batch = next(iter(recovered_dataloader)) + + assert compute_batch_hash(batch) == compute_batch_hash(recovered_first_batch) + + parallel_context.destroy() diff --git a/tests/test_optimizer_params_groups.py b/tests/test_optimizer_params_groups.py new file mode 100644 index 00000000..fa835e1c --- /dev/null +++ b/tests/test_optimizer_params_groups.py @@ -0,0 +1,581 @@ +import pytest +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from helpers.utils import available_gpus, init_distributed, rerun_if_address_is_in_use +from nanotron.optim.gradient_accumulator import FP32GradientAccumulator +from nanotron.optim.named_optimizer import NamedOptimizer +from nanotron.optim.optimizer_from_gradient_accumulator import OptimizerFromGradientAccumulator +from nanotron.parallel.context import ParallelContext +from nanotron.parallel.parameters import NanotronParameter +from nanotron.random import set_random_seed + + +class DummyModel(nn.Module): + def __init__(self, dtype=torch.float32): + super(DummyModel, self).__init__() + self.fc1 = nn.Linear(10, 20, bias=False).to(dtype=dtype) + self.fc2 = nn.Linear(20, 2, bias=False).to(dtype=dtype) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + return x + + +def test_optimizer_lr_one_group(): + set_random_seed(42) + + model = DummyModel().to("cuda") + + lr1 = 0.1 + + named_params_or_groups = 
[] + for name, param in model.named_parameters(): + named_params_or_groups.append((name, param)) + named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}] + + optimizer = NamedOptimizer( + named_params_or_groups=named_params_or_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups + ), + ) + + input = torch.randn(10, 10).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for _ in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output, target) + loss.backward() + + fc1_grad = model.fc1.weight.grad.clone() + fc2_grad = model.fc2.weight.grad.clone() + + # compute gradient manually + with torch.no_grad(): + expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad + expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +def test_optimizer_lr_multiple_group(): + set_random_seed(42) + + model = DummyModel().to("cuda") + + lr1, lr2 = 0.1, 0.001 + + named_params_or_groups = [ + {"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1}, + {"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2}, + ] + + optimizer = NamedOptimizer( + named_params_or_groups=named_params_or_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups + ), + ) + + input = torch.randn(10, 10).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for _ in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output, target) + loss.backward() + + fc1_grad = model.fc1.weight.grad.clone() + fc2_grad = model.fc2.weight.grad.clone() + + with torch.no_grad(): + expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad + expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +def test_optimizer_lr_weight_decay_one_group(): + set_random_seed(42) + + model = DummyModel().to("cuda") + + lr1 = 0.1 + weight_decay = 0.1 + + named_params_or_groups = [] + for name, param in model.named_parameters(): + named_params_or_groups.append((name, param)) + named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}] + + optimizer = NamedOptimizer( + named_params_or_groups=named_params_or_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups + ), + ) + + input = torch.randn(10, 10).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for _ in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output, target) + loss.backward() + + # Compute gradient manually and apply weight decay + with torch.no_grad(): + expected_fc1_weight = (1 - lr1 * 
weight_decay) * model.fc1.weight - lr1 * model.fc1.weight.grad + expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * model.fc2.weight.grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +def test_optimizer_lr_weight_decay_multiple_group(): + set_random_seed(42) + + model = DummyModel().to("cuda") + + lr1, lr2 = 0.1, 0.001 + weight_decay1, weight_decay2 = 0.1, 0.001 + + named_params_or_groups = [ + { + "named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], + "lr": lr1, + "weight_decay": weight_decay1, + }, + { + "named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], + "lr": lr2, + "weight_decay": weight_decay2, + }, + ] + + optimizer = NamedOptimizer( + named_params_or_groups=named_params_or_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups + ), + ) + + input = torch.randn(10, 10).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for _ in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output, target) + loss.backward() + + # Compute gradient manually and apply weight decay + with torch.no_grad(): + expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * model.fc1.weight.grad + expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * model.fc2.weight.grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("accumulation_steps", [1, 10]) +def test_optimizer_grad_accumulation_lr_one_group(half_precision: torch.dtype, accumulation_steps: int): + set_random_seed(42) + dtype = half_precision + lr1 = 0.1 + + model = DummyModel(dtype=dtype).to("cuda") + + # Need to convert the weights to NanotronParameter for the gradient accumulation to work + model.fc1.weight = NanotronParameter(model.fc1.weight) + model.fc2.weight = NanotronParameter(model.fc2.weight) + + named_params_or_groups = [] + for name, param in model.named_parameters(): + named_params_or_groups.append((name, param)) + + named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1}] + + # Optimizer + def optimizer_builder(inp_param_groups): + return NamedOptimizer( + named_params_or_groups=inp_param_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups + ), + ) + + optimizer = OptimizerFromGradientAccumulator( + gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params), + named_params_or_groups=named_params_or_groups, + optimizer_builder=optimizer_builder, + ) + + accumulator = optimizer.gradient_accumulator + + input = torch.randn(10, 10, dtype=dtype).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for batch_idx in range(100): + optimizer.zero_grad() + + output = model(input) + loss = 
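# The expected-weight formulas asserted in these tests rely on the identity
#   w - lr * (g + wd * w) == (1 - lr * wd) * w - lr * g,
# i.e. torch.optim.SGD's L2-style weight decay (decay folded into the gradient) matches
# the closed form computed manually above and below.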
F.cross_entropy(output.float(), target) + accumulator.backward(loss) + + if (batch_idx + 1) % accumulation_steps == 0: + + # Manual update weights for ref + with torch.no_grad(): + fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype) + expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad + + fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype) + expected_fc2_weight = model.fc2.weight - lr1 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("accumulation_steps", [1, 10]) +def test_optimizer_grad_accumulation_lr_multiple_group(half_precision: torch.dtype, accumulation_steps: int): + set_random_seed(42) + dtype = half_precision + lr1, lr2 = 0.1, 0.001 + + model = DummyModel(dtype=dtype).to("cuda") + + # Need to convert the weights to NanotronParameter for the gradient accumulation to work + model.fc1.weight = NanotronParameter(model.fc1.weight) + model.fc2.weight = NanotronParameter(model.fc2.weight) + + named_params_or_groups = [ + {"named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], "lr": lr1}, + {"named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], "lr": lr2}, + ] + + # Optimizer + def optimizer_builder(inp_param_groups): + return NamedOptimizer( + named_params_or_groups=inp_param_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that should be overwritten by the lr in the named_params_or_groups + ), + ) + + optimizer = OptimizerFromGradientAccumulator( + gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params), + named_params_or_groups=named_params_or_groups, + optimizer_builder=optimizer_builder, + ) + + accumulator = optimizer.gradient_accumulator + + input = torch.randn(10, 10, dtype=dtype).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for batch_idx in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output.float(), target) + accumulator.backward(loss) + + if (batch_idx + 1) % accumulation_steps == 0: + + # Manual update weights for ref + with torch.no_grad(): + fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype) + expected_fc1_weight = model.fc1.weight - lr1 * fc1_grad + + fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype) + expected_fc2_weight = model.fc2.weight - lr2 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("accumulation_steps", [1, 10]) +def test_optimizer_grad_accumulation_lr_weight_decay_one_group(half_precision: torch.dtype, accumulation_steps: int): + set_random_seed(42) + dtype = half_precision + lr1 = 0.1 + weight_decay = 0.1 + + model = DummyModel(dtype=dtype).to("cuda") + + # Need to convert the weights to NanotronParameter for the gradient accumulation to work + model.fc1.weight = NanotronParameter(model.fc1.weight) + 
model.fc2.weight = NanotronParameter(model.fc2.weight) + + named_params_or_groups = [] + for name, param in model.named_parameters(): + named_params_or_groups.append((name, param)) + named_params_or_groups = [{"named_params": named_params_or_groups, "lr": lr1, "weight_decay": weight_decay}] + + # Optimizer + def optimizer_builder(inp_param_groups): + return NamedOptimizer( + named_params_or_groups=inp_param_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups + weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups + ), + ) + + optimizer = OptimizerFromGradientAccumulator( + gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params), + named_params_or_groups=named_params_or_groups, + optimizer_builder=optimizer_builder, + ) + + accumulator = optimizer.gradient_accumulator + + input = torch.randn(10, 10, dtype=dtype).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for batch_idx in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output.float(), target) + accumulator.backward(loss) + + if (batch_idx + 1) % accumulation_steps == 0: + + # Manual update weights for ref + with torch.no_grad(): + fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype) + expected_fc1_weight = (1 - lr1 * weight_decay) * model.fc1.weight - lr1 * fc1_grad + + fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype) + expected_fc2_weight = (1 - lr1 * weight_decay) * model.fc2.weight - lr1 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("accumulation_steps", [1, 10]) +def test_optimizer_grad_accumulation_lr_weight_decay_multiple_group( + half_precision: torch.dtype, accumulation_steps: int +): + set_random_seed(42) + dtype = half_precision + lr1, lr2 = 0.1, 0.001 + weight_decay1, weight_decay2 = 0.1, 0.001 + + model = DummyModel(dtype=dtype).to("cuda") + + # Need to convert the weights to NanotronParameter for the gradient accumulation to work + model.fc1.weight = NanotronParameter(model.fc1.weight) + model.fc2.weight = NanotronParameter(model.fc2.weight) + + named_params_or_groups = [ + { + "named_params": [(name, param) for name, param in model.named_parameters() if "fc1" in name], + "lr": lr1, + "weight_decay": weight_decay1, + }, + { + "named_params": [(name, param) for name, param in model.named_parameters() if "fc2" in name], + "lr": lr2, + "weight_decay": weight_decay2, + }, + ] + # Optimizer + def optimizer_builder(inp_param_groups): + return NamedOptimizer( + named_params_or_groups=inp_param_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups + weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups + ), + ) + + optimizer = OptimizerFromGradientAccumulator( + gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params), + 
named_params_or_groups=named_params_or_groups, + optimizer_builder=optimizer_builder, + ) + + accumulator = optimizer.gradient_accumulator + + input = torch.randn(10, 10, dtype=dtype).to(device="cuda") + target = torch.randint(0, 2, (10,)).to(device="cuda") + + for batch_idx in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output.float(), target) + accumulator.backward(loss) + + if (batch_idx + 1) % accumulation_steps == 0: + + # Manual update weights for ref + with torch.no_grad(): + fc1_grad = accumulator.get_grad_buffer(name="fc1.weight").to(dtype) + expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad + + fc2_grad = accumulator.get_grad_buffer(name="fc2.weight").to(dtype) + expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) + + +@pytest.mark.skipif(available_gpus() < 2, reason="Testing requires at least 2 gpus") +@pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("accumulation_steps", [1, 10]) +@rerun_if_address_is_in_use() +def test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group( + half_precision: torch.dtype, accumulation_steps: int +): + init_distributed(tp=1, dp=2, pp=1)(_test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group)( + half_precision=half_precision, + accumulation_steps=accumulation_steps, + ) + + +def _test_ddp_optimizer_grad_accumulation_lr_weight_decay_multiple_group( + parallel_context: ParallelContext, half_precision: torch.dtype, accumulation_steps: int +): + set_random_seed(42) + dtype = half_precision + # Making it bigger so that the difference is more visible during update + lr1, lr2 = 0.04, 0.05 + weight_decay1, weight_decay2 = 0.5, 0.2 + + model = DummyModel(dtype=dtype).to("cuda") + # Need to convert the weights to NanotronParameter for the gradient accumulation to work + model.fc1.weight = NanotronParameter(model.fc1.weight) + model.fc2.weight = NanotronParameter(model.fc2.weight) + + model_ddp = torch.nn.parallel.DistributedDataParallel( + model, + process_group=parallel_context.dp_pg, + ) + + named_params_or_groups = [ + { + "named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc1" in name], + "lr": lr1, + "weight_decay": weight_decay1, + }, + { + "named_params": [(name, param) for name, param in model_ddp.named_parameters() if "fc2" in name], + "lr": lr2, + "weight_decay": weight_decay2, + }, + ] + # Optimizer + def optimizer_builder(inp_param_groups): + return NamedOptimizer( + named_params_or_groups=inp_param_groups, + optimizer_builder=lambda param_groups: optim.SGD( + param_groups, + lr=9999999, # this is a dummy value that will be overwritten by the lr in the named_params_or_groups + weight_decay=9999999, # this is a dummy value that will be overwritten by the weight_decay in the named_params_or_groups + ), + ) + + optimizer = OptimizerFromGradientAccumulator( + gradient_accumulator_builder=lambda named_params: FP32GradientAccumulator(named_parameters=named_params), + named_params_or_groups=named_params_or_groups, + optimizer_builder=optimizer_builder, + ) + + accumulator = optimizer.gradient_accumulator + + input = torch.randn(10, 10, dtype=dtype).to(device="cuda") + target = torch.randint(0, 2, 
(10,)).to(device="cuda") + + for batch_idx in range(100): + optimizer.zero_grad() + + output = model(input) + loss = F.cross_entropy(output.float(), target) + accumulator.backward(loss) + + if (batch_idx + 1) % accumulation_steps == 0: + + # Manual update weights for ref + with torch.no_grad(): + fc1_grad = accumulator.get_grad_buffer(name="module.fc1.weight").to(dtype) + expected_fc1_weight = (1 - lr1 * weight_decay1) * model.fc1.weight - lr1 * fc1_grad + + fc2_grad = accumulator.get_grad_buffer(name="module.fc2.weight").to(dtype) + expected_fc2_weight = (1 - lr2 * weight_decay2) * model.fc2.weight - lr2 * fc2_grad + + optimizer.step() + + updated_fc1_weight = model.fc1.weight + updated_fc2_weight = model.fc2.weight + + torch.testing.assert_close(expected_fc1_weight, updated_fc1_weight) + torch.testing.assert_close(expected_fc2_weight, updated_fc2_weight) diff --git a/tools/preprocess_data.py b/tools/preprocess_data.py new file mode 100644 index 00000000..465d22f0 --- /dev/null +++ b/tools/preprocess_data.py @@ -0,0 +1,115 @@ +import argparse +import os +import shutil +import sys + +import numpy as np +import torch.distributed as dist +from tqdm import tqdm +from transformers import AutoTokenizer + +from datasets import concatenate_datasets, load_dataset + + +def get_args(): + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument( + "--input", type=str, required=True, help="Path to local stored dataset or repository on the Hugging Face hub" + ) + group.add_argument("--column", type=str, default="text", help="Column to preprocess from the Dataset") + parser.add_argument("--split", type=str, default="train", help="Which split of the data to process") + + group = parser.add_argument_group(title="tokenizer") + group.add_argument( + "--tokenizer-name-or-path", + type=str, + required=True, + help="A path to a directory containing vocabulary files required by the tokenizer or the model id of a predefined tokenizer hosted inside a model repo on the Hugging Face Hub.", + ) + group.add_argument( + "--add-special-tokens", + action="store_true", + help="Whether or not to add special tokens when encoding the sequences. 
This will be passed to the Tokenizer", + ) + + group = parser.add_argument_group(title="output data") + group.add_argument("--output-prefix", type=str, required=True, help="Path to the output processed dataset file") + + args = parser.parse_args() + + return args + + +def main(args): + + world_size, rank = int(os.environ["WORLD_SIZE"]), int(os.environ["RANK"]) + + # Remove stdout from all processes except main to not flood the stdout + if rank: + sys.stdout = open(os.devnull, "w") + + # Check if output directory exists + if not os.path.isdir(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))): + print(f"Creating {os.path.abspath(os.path.join(args.output_prefix, os.path.pardir))} directory...") + os.makedirs(os.path.abspath(os.path.join(args.output_prefix, os.path.pardir)), exist_ok=True) + + if args.input.endswith(".json"): # For processing JSON files (Cross compatibility with other projects) + ds = load_dataset("json", data_files=args.input) + ds = concatenate_datasets( + [ds[splits] for splits in ds.keys()] + ) # load_dataset returns DatasetDict and we want a Dataset + else: + ds = load_dataset(args.input, split=args.split) + + ds = ds.shard(num_shards=world_size, index=rank, contiguous=True) + ds = ds.select_columns(args.column) + + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path) + token_dtype = np.int32 if len(tokenizer) > np.iinfo(np.uint16).max + 1 else np.uint16 + + # Create tmp directory for worker outputs + tmp_folder = os.path.abspath(os.path.join(args.output_prefix, os.pardir, "tmp")) + os.makedirs(tmp_folder, exist_ok=True) + + print("Creating workers output files...") + worker_output_file = os.path.join(tmp_folder, f"worker_{rank}_input_ids.npy") + ds = ds.map( + lambda x: {"input_ids": tokenizer(x, add_special_tokens=args.add_special_tokens).input_ids}, + input_columns=args.column, + batched=True, + desc="Tokenizing Dataset", + remove_columns=[args.column], + ) + + worker_input_ids_file = open(worker_output_file, "wb") + for sample in ds: + np_array = np.array(sample["input_ids"], dtype=token_dtype) + worker_input_ids_file.write(np_array.tobytes(order="C")) + worker_input_ids_file.close() + + # Wait for all workers to process each shard of the Dataset + dist.barrier() + + # Only the main rank merges the worker files + if not rank: + output_file = f"{args.output_prefix}_input_ids.npy" + input_ids_file = open(output_file, "wb") + for worker_idx in tqdm(range(world_size), desc="Merging workers output files"): + worker_output_file = os.path.join(tmp_folder, f"worker_{worker_idx}_input_ids.npy") + with open(worker_output_file, "rb") as f: + shutil.copyfileobj(f, input_ids_file) + os.remove(worker_output_file) + + input_ids_file.close() + os.rmdir(tmp_folder) + print(f"Done! {args.input} processed dataset stored in {output_file}") + + else: # Close devnull stdout redirect + sys.stdout.close() + + +if __name__ == "__main__": + _args = get_args() + dist.init_process_group(backend="gloo") + main(_args)