
Commit 39d434d
lots more tests
corbt committed Oct 27, 2024
1 parent d4c4c4d commit 39d434d
Showing 14 changed files with 1,057 additions and 85 deletions.
6 changes: 6 additions & 0 deletions prepare_env.sh
@@ -14,6 +14,12 @@ add_to_bashrc "export HF_HOME=/workspace/.cache/huggingface"
add_to_bashrc "export HF_HUB_ENABLE_HF_TRANSFER=1"
add_to_bashrc "source /workspace/.env"

+# Add uv to path
+add_to_bashrc "export PATH=\"/root/.cargo/bin:\$PATH\""
+
+# Enable CUDA debugging
+add_to_bashrc "export CUDA_LAUNCH_BLOCKING=1"
+
source ~/.bashrc

# Install system dependencies
1 change: 1 addition & 0 deletions pyproject.toml
@@ -18,6 +18,7 @@ dependencies = [
"peft>=0.13.2",
"polars>=1.9.0",
"python-dotenv>=1.0.1",
"schedulefree>=1.2.7",
"scikit-learn>=1.5.2",
"seaborn>=0.13.2",
"sglang[all]>=0.3.3.post1",
262 changes: 194 additions & 68 deletions stories-analysis.ipynb

Large diffs are not rendered by default.

136 changes: 136 additions & 0 deletions stories_train_model_v10.py
@@ -0,0 +1,136 @@
import torch
from datasets import load_dataset, Dataset
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
Trainer,
TrainingArguments,
)
from peft.tuners.lora import LoraConfig
from peft.mapping import get_peft_model
import wandb
from dotenv import load_dotenv
import polars as pl
from utils import stories_dataset
from liger_kernel.transformers import _apply_liger_kernel_to_instance
from training_helpers import compute_metrics

load_dotenv("/workspace/.env")

# Configuration
base_model = "google/gemma-2-9b"
run_name = __file__.split("/")[-1].replace(".py", "")
output_dir = f"./models/{run_name}"
num_epochs = 1
batch_size = 2
gradient_accumulation_steps = 8
learning_rate = 2e-4
max_length = 4096

# Initialize wandb
wandb.init(project="hn_stories_model_training", name=run_name)


def create_dataset(split, num_rows, tokenizer):
stories = stories_dataset()
stories = stories.filter(pl.col("split") == split).head(num_rows)

stories = stories.with_columns(
[
pl.col("serialized").alias("text"),
pl.col("log_score").alias("label"),
]
)

stories = stories.with_columns(
[
pl.col("text")
.map_elements(
lambda x: tokenizer(x)["input_ids"], return_dtype=pl.List(pl.Int64)
)
.alias("input_ids"),
]
).select(["input_ids", "label"])
return Dataset.from_polars(stories)


print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(
base_model,
truncation=True,
padding=True,
max_length=max_length,
)

model = AutoModelForSequenceClassification.from_pretrained(
base_model,
num_labels=1, # Regression task
device_map="auto",
attn_implementation="flash_attention_2",
torch_dtype=torch.bfloat16,
)
_apply_liger_kernel_to_instance(model=model)

model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = "right"

print("Configuring LoRA...")
model = get_peft_model(
model,
LoraConfig(
task_type="SEQ_CLS",
r=8,
lora_alpha=16,
lora_dropout=0,
),
)

print("Loading dataset...")
train_stories = create_dataset("train", 1000000, tokenizer)
validation_stories = create_dataset("val", 1000, tokenizer)


# Configure training arguments
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
learning_rate=learning_rate,
weight_decay=0,
evaluation_strategy="steps",
eval_steps=0.05,
logging_steps=100,
save_strategy="steps",
save_steps=1000,
report_to="wandb",
no_cuda=False,
bf16=True,
warmup_steps=100,
gradient_accumulation_steps=gradient_accumulation_steps,
)


print("Initializing Trainer...")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_stories,
eval_dataset=validation_stories,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)

print("Running initial evaluation...")
results = trainer.evaluate()
print("Initial evaluation complete")
print(results)

print("Starting model training...")
trainer.train()

print("Saving final model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Stories model training complete")
7 changes: 1 addition & 6 deletions stories_train_model_v2.py
@@ -14,6 +14,7 @@
from utils import stories_dataset
from sklearn.metrics import mean_squared_error
from liger_kernel.transformers import _apply_liger_kernel_to_instance
+from training_helpers import compute_metrics

load_dotenv("/workspace/.env")

@@ -110,12 +111,6 @@ def create_dataset(split, num_rows, tokenizer):
)


-def compute_metrics(eval_pred):
-    predictions, labels = eval_pred
-    rmse = mean_squared_error(labels, predictions, squared=False)
-    return {"rmse": rmse}


print("Initializing Trainer...")
trainer = Trainer(
model=model,
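Note: the compute_metrics imported from training_helpers above replaces the inline definition deleted in this hunk. training_helpers.py is one of the 14 changed files but is not rendered on this page; judging from the deleted code here (and in stories_train_model_v3.py below), the shared helper is presumably equivalent to this sketch:

from sklearn.metrics import mean_squared_error


def compute_metrics(eval_pred):
    # The Trainer passes (predictions, labels) for the eval set
    predictions, labels = eval_pred
    # squared=False turns mean squared error into root-mean-squared error
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}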
11 changes: 2 additions & 9 deletions stories_train_model_v3.py
@@ -12,14 +12,14 @@
from dotenv import load_dotenv
import polars as pl
from utils import stories_dataset
-from sklearn.metrics import mean_squared_error
from liger_kernel.transformers import _apply_liger_kernel_to_instance
+from training_helpers import compute_metrics

load_dotenv("/workspace/.env")

# Configuration
base_model = "unsloth/Meta-Llama-3.1-8B"
run_name = "stories_model_v2"
run_name = __file__.split("/")[-1].replace(".py", "")
output_dir = f"./models/{run_name}"
num_epochs = 1
batch_size = 4
@@ -108,16 +108,9 @@ def create_dataset(split, num_rows, tokenizer):
bf16=True,
warmup_steps=100,
gradient_accumulation_steps=gradient_accumulation_steps,
-    # use_liger_kernel=True,
)


-def compute_metrics(eval_pred):
-    predictions, labels = eval_pred
-    rmse = mean_squared_error(labels, predictions, squared=False)
-    return {"rmse": rmse}


print("Initializing Trainer...")
trainer = Trainer(
model=model,
5 changes: 3 additions & 2 deletions stories_train_model_v4.py
@@ -19,9 +19,9 @@

# Configuration
base_model = "unsloth/Meta-Llama-3.1-8B"
run_name = "stories_model_v4"
run_name = "stories_model_schedulefree_v1"
output_dir = f"./models/{run_name}"
-num_epochs = 2
+num_epochs = 1
batch_size = 4
gradient_accumulation_steps = 4
learning_rate = 2e-4
Expand Down Expand Up @@ -104,6 +104,7 @@ def create_dataset(split, num_rows, tokenizer):
save_strategy="steps",
save_steps=1000,
report_to="wandb",
optim="schedule_free_adamw",
no_cuda=False,
bf16=True,
warmup_steps=100,
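The optim="schedule_free_adamw" setting added above is presumably why this commit adds the schedulefree dependency to pyproject.toml: with that option, Transformers constructs a schedule-free AdamW optimizer from that package instead of the default AdamW plus learning-rate scheduler. A minimal standalone sketch of the underlying optimizer (the toy model and hyperparameters here are illustrative, not taken from the commit):

import torch
from schedulefree import AdamWScheduleFree

model = torch.nn.Linear(8, 1)  # toy model, for illustration only
optimizer = AdamWScheduleFree(model.parameters(), lr=2e-4, warmup_steps=100)

optimizer.train()  # schedule-free optimizers must be switched to train mode before stepping
loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()

optimizer.eval()  # ...and to eval mode before evaluation or checkpointing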
140 changes: 140 additions & 0 deletions stories_train_model_v5.py
@@ -0,0 +1,140 @@
import torch
from datasets import load_dataset, Dataset
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
Trainer,
TrainingArguments,
)
from peft.tuners.lora import LoraConfig
from peft.mapping import get_peft_model
import wandb
from dotenv import load_dotenv
import polars as pl
from utils import stories_dataset
from liger_kernel.transformers import _apply_liger_kernel_to_instance
from training_helpers import compute_metrics

load_dotenv("/workspace/.env")

# Configuration
base_model = "unsloth/Meta-Llama-3.1-8B"
run_name = "stories_train_model_v5"
output_dir = f"./models/{run_name}"
num_epochs = 1
batch_size = 4
gradient_accumulation_steps = 4
learning_rate = 2e-4
max_length = 4096

# Initialize wandb
wandb.init(project="hn_stories_model_training", name=run_name)


def create_dataset(split, num_rows, tokenizer):
stories = stories_dataset()
stories = stories.filter(pl.col("split") == split).head(num_rows)

stories = stories.with_columns(
[
pl.col("serialized").alias("text"),
pl.col("log_score").alias("label"),
]
)

stories = stories.with_columns(
[
pl.col("text")
.map_elements(
lambda x: tokenizer(x)["input_ids"], return_dtype=pl.List(pl.Int64)
)
.alias("input_ids"),
]
).select(["input_ids", "label"])
return Dataset.from_polars(stories)


print("Loading tokenizer and model...")
tokenizer = AutoTokenizer.from_pretrained(
base_model,
truncation=True,
padding=True,
max_length=max_length,
)

model = AutoModelForSequenceClassification.from_pretrained(
base_model,
num_labels=1, # Regression task
device_map="auto",
attn_implementation="flash_attention_2",
torch_dtype=torch.bfloat16,
)
_apply_liger_kernel_to_instance(model=model)

model.config.pad_token_id = tokenizer.pad_token_id
tokenizer.padding_side = "right"

print("Configuring LoRA...")
model = get_peft_model(
model,
LoraConfig(
task_type="SEQ_CLS",
target_modules=[
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
],
r=8,
lora_alpha=16,
lora_dropout=0,
),
)

print("Loading dataset...")
train_stories = create_dataset("train", 1000000, tokenizer)
validation_stories = create_dataset("val", 1000, tokenizer)


# Configure training arguments
training_args = TrainingArguments(
output_dir=output_dir,
num_train_epochs=num_epochs,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
learning_rate=learning_rate,
weight_decay=0,
evaluation_strategy="steps",
eval_steps=0.05,
logging_steps=100,
save_strategy="steps",
save_steps=1000,
report_to="wandb",
no_cuda=False,
bf16=True,
warmup_steps=100,
gradient_accumulation_steps=gradient_accumulation_steps,
)


print("Initializing Trainer...")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_stories,
eval_dataset=validation_stories,
tokenizer=tokenizer,
compute_metrics=compute_metrics,
)

print("Starting model training...")
trainer.train()

print("Saving final model...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("Stories model training complete")
(The remaining changed files in this commit are not rendered here.)

0 comments on commit 39d434d
