From ade1c1b8ae2dfa0162499b5359662d7c4d1ea2b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 13 Mar 2024 02:13:36 +0000
Subject: [PATCH] update

---
 litgpt/data/tinyllama.py        | 12 ++++++------
 tutorials/pretrain_tinyllama.md | 34 ++++++---------------------------
 2 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/litgpt/data/tinyllama.py b/litgpt/data/tinyllama.py
index d67cc64c37..54d3ba9bca 100644
--- a/litgpt/data/tinyllama.py
+++ b/litgpt/data/tinyllama.py
@@ -35,9 +35,9 @@ class TinyLlama(LitDataModule):
 
     def __post_init__(self):
         # Could be a remote path (s3://) or a local path
-        self.slimpajama_train = str(self.data_path).rstrip("/") + "/slimpajama/train"
-        self.slimpajama_val = str(self.data_path).rstrip("/") + "/slimpajama/val"
-        self.starcoder_train = str(self.data_path).rstrip("/") + "/starcoder"
+        self.slimpajama_train = os.path.join(str(self.data_path), "slimpajama", "train")
+        self.slimpajama_val = os.path.join(str(self.data_path), "slimpajama", "val")
+        self.starcoder_train = os.path.join(str(self.data_path), "starcoder")
 
     def connect(
         self,
@@ -60,17 +60,17 @@ def prepare_data(self) -> None:
         # )
 
         prepare_slimpajama(
-            input_dir=os.path.join(self.data_path, "SlimPajama-627B/train"),
+            input_dir=os.path.join(self.data_path, "slimpajama-raw/train"),
             output_dir=self.slimpajama_train,
             tokenizer=self.tokenizer,
         )
         prepare_slimpajama(
-            input_dir=os.path.join(self.data_path, "SlimPajama-627B/validation"),
+            input_dir=os.path.join(self.data_path, "slimpajama-raw/validation"),
             output_dir=self.slimpajama_val,
             tokenizer=self.tokenizer,
         )
         prepare_starcoder(
-            input_dir=os.path.join(self.data_path, "starcoderdata"),
+            input_dir=os.path.join(self.data_path, "starcoderdata-raw"),
             output_dir=self.starcoder_train,
             tokenizer=self.tokenizer,
         )
diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md
index 245ec48ab7..ef53f63476 100644
--- a/tutorials/pretrain_tinyllama.md
+++ b/tutorials/pretrain_tinyllama.md
@@ -49,7 +49,7 @@ In order to start pretraining litgpt on it, you need to read, tokenize, and writ
 First, install additional dependencies for preprocessing:
 
 ```bash
-pip install '.[all]'
+pip install litgpt '.[all]'
 ```
 
 You will need to have the tokenizer config available:
@@ -61,38 +61,16 @@ litgpt download \
   --tokenizer_only true
 ```
 
-Then, run the preprocessing script for each dataset and split.
-You will require **1.1 TB** of disk space for Starcoder and **2.5** TB of space for the SlimPajama dataset.
-
-**Starcoder:**
-
-```bash
-python litgpt/data/prepare_starcoder.py \
-  --input_dir data/starcoderdata-raw \
-  --output_dir data/starcoder \
-  --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
-```
-
-**SlimPajama:**
+Then, run the preprocessing command by pointing to the directory where the data was downloaded.
+You will require an additional **1.1 TB** of disk space for Starcoder and **2.5 TB** of space for the SlimPajama dataset.
 
 ```bash
-python litgpt/data/prepare_slimpajama.py \
-  --input_dir data/slimpajama-raw/validation \
-  --output_dir data/slimpajama/val \
-  --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
-
-python litgpt/data/prepare_slimpajama.py \
-  --input_dir data/slimpajama-raw/test \
-  --output_dir data/slimpajama/test \
-  --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
-
-python litgpt/data/prepare_slimpajama.py \
-  --input_dir data/slimpajama-raw/train \
-  --output_dir data/slimpajama/train \
+litgpt prepare \
+  --data TinyLlama \
+  --data.data_path data \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 ```
 
-If you want to run on a small slice of the datasets first, pass the flag `--fast_dev_run=true` to the commands above.
 In the above we are assuming that you will be using the same tokenizer as used in LlaMA/TinyLlama, but any trained [SentencePiece](https://github.com/google/sentencepiece) tokenizer with a 32000 vocabulary size will do here.
 
 ## Pretraining