Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
awaelchli committed Mar 13, 2024
1 parent c03d0ba commit 22803c0
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions litgpt/data/tinyllama.py
Original file line number Diff line number Diff line change
@@ -47,7 +47,8 @@ def connect(
     ) -> None:
         self.tokenizer = tokenizer
         self.batch_size = batch_size
-        self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well
+        if max_seq_length:
+            self.seq_length = max_seq_length + 1 # Increase by one because we need the next token as well
 
     def prepare_data(self) -> None:
         # for path in (self.slimpajama_train, self.slimpajama_val, self.starcoder_train):
@@ -59,17 +60,17 @@ def prepare_data(self) -> None:
         # )
 
         prepare_slimpajama(
-            input_dir=(self.data_path / "SlimPajama-627B/train"), # TODO: double check folder name
+            input_dir=os.path.join(self.data_path, "SlimPajama-627B/train"),
             output_dir=self.slimpajama_train,
             tokenizer=self.tokenizer,
         )
         prepare_slimpajama(
-            input_dir=(self.data_path / "SlimPajama-627B/val"), # TODO: double check folder name
+            input_dir=os.path.join(self.data_path, "SlimPajama-627B/validation"),
             output_dir=self.slimpajama_val,
             tokenizer=self.tokenizer,
         )
         prepare_starcoder(
-            input_dir=(self.data_path / "starcoderdata"),
+            input_dir=os.path.join(self.data_path, "starcoderdata"),
             output_dir=self.starcoder_train,
             tokenizer=self.tokenizer,
         )
Expand Down

0 comments on commit 22803c0

Please sign in to comment.