From ade1c1b8ae2dfa0162499b5359662d7c4d1ea2b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 13 Mar 2024 02:13:36 +0000
Subject: [PATCH] update

---
 litgpt/data/tinyllama.py        | 12 ++++++------
 tutorials/pretrain_tinyllama.md | 34 ++++++---------------------------
 2 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/litgpt/data/tinyllama.py b/litgpt/data/tinyllama.py
index d67cc64c37..54d3ba9bca 100644
--- a/litgpt/data/tinyllama.py
+++ b/litgpt/data/tinyllama.py
@@ -35,9 +35,9 @@ class TinyLlama(LitDataModule):
 
     def __post_init__(self):
         # Could be a remote path (s3://) or a local path
-        self.slimpajama_train = str(self.data_path).rstrip("/") + "/slimpajama/train"
-        self.slimpajama_val = str(self.data_path).rstrip("/") + "/slimpajama/val"
-        self.starcoder_train = str(self.data_path).rstrip("/") + "/starcoder"
+        self.slimpajama_train = os.path.join(str(self.data_path), "slimpajama", "train")
+        self.slimpajama_val = os.path.join(str(self.data_path), "slimpajama", "val")
+        self.starcoder_train = os.path.join(str(self.data_path), "starcoder")
 
     def connect(
         self,
@@ -60,17 +60,17 @@ def prepare_data(self) -> None:
         # )
 
         prepare_slimpajama(
-            input_dir=os.path.join(self.data_path, "SlimPajama-627B/train"),
+            input_dir=os.path.join(self.data_path, "slimpajama-raw/train"),
             output_dir=self.slimpajama_train,
             tokenizer=self.tokenizer,
         )
         prepare_slimpajama(
-            input_dir=os.path.join(self.data_path, "SlimPajama-627B/validation"),
+            input_dir=os.path.join(self.data_path, "slimpajama-raw/validation"),
             output_dir=self.slimpajama_val,
             tokenizer=self.tokenizer,
         )
         prepare_starcoder(
-            input_dir=os.path.join(self.data_path, "starcoderdata"),
+            input_dir=os.path.join(self.data_path, "starcoderdata-raw"),
             output_dir=self.starcoder_train,
             tokenizer=self.tokenizer,
         )
diff --git a/tutorials/pretrain_tinyllama.md b/tutorials/pretrain_tinyllama.md
index 245ec48ab7..ef53f63476 100644
--- a/tutorials/pretrain_tinyllama.md
+++ b/tutorials/pretrain_tinyllama.md
@@ -49,7 +49,7 @@ In order to start pretraining litgpt on it, you need to read, tokenize, and writ
 First, install additional dependencies for preprocessing:
 
 ```bash
-pip install '.[all]'
+pip install litgpt '.[all]'
 ```
 
 You will need to have the tokenizer config available:
@@ -61,38 +61,16 @@ litgpt download \
   --tokenizer_only true
 ```
 
-Then, run the preprocessing script for each dataset and split.
-You will require **1.1 TB** of disk space for Starcoder and **2.5** TB of space for the SlimPajama dataset.
-
-**Starcoder:**
-
-```bash
-python litgpt/data/prepare_starcoder.py \
-  --input_dir data/starcoderdata-raw \
-  --output_dir data/starcoder \
-  --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
-```
-
-**SlimPajama:**
+Then, run the preprocessing command by pointing to the directory where the data was downloaded.
+You will require an additional **1.1 TB** of disk space for Starcoder and **2.5 TB** of space for the SlimPajama dataset.
 
 ```bash
-python litgpt/data/prepare_slimpajama.py \
-  --input_dir data/slimpajama-raw/validation \
-  --output_dir data/slimpajama/val \
-  --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
-
-python litgpt/data/prepare_slimpajama.py \
-  --input_dir data/slimpajama-raw/test \
-  --output_dir data/slimpajama/test \
-  --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
-
-python litgpt/data/prepare_slimpajama.py \
-  --input_dir data/slimpajama-raw/train \
-  --output_dir data/slimpajama/train \
+litgpt prepare \
+  --data TinyLlama \
+  --data.data_path data \
   --tokenizer_path checkpoints/meta-llama/Llama-2-7b-hf
 ```
 
-If you want to run on a small slice of the datasets first, pass the flag `--fast_dev_run=true` to the commands above.
 In the above we are assuming that you will be using the same tokenizer as used in LlaMA/TinyLlama, but any trained [SentencePiece](https://github.com/google/sentencepiece) tokenizer with a 32000 vocabulary size will do here.
 
 ## Pretraining