Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def decode(self, ids: list) -> str:
characters = []
for id in ids:
characters.append(self.id_to_char[id])
return ''.join(characters)
return "".join(characters)


def get_batch(data: torch.Tensor, block_size: int, batch_size: int):
Expand Down Expand Up @@ -152,8 +152,10 @@ def get_batch(data: torch.Tensor, block_size: int, batch_size: int):
y_list = []

for pos in positions:
x_list.append(data[pos : pos + block_size]) # Input: chars 0 to n-1
y_list.append(data[pos + 1 : pos + block_size + 1]) # Target: chars 1 to n (shifted by 1)
x_list.append(data[pos : pos + block_size]) # Input: chars 0 to n-1
y_list.append(
data[pos + 1 : pos + block_size + 1]
) # Target: chars 1 to n (shifted by 1)

# 3. Stack into batch tensors: (batch_size, block_size)
x = torch.stack(x_list)
Expand All @@ -179,7 +181,7 @@ def load_data(block_size: int = 256, train_split: float = 0.9):
download_shakespeare()

# 2. Load the text file
with open(DATA_PATH, 'r', encoding='utf-8') as file:
with open(DATA_PATH, "r", encoding="utf-8") as file:
text = file.read()

print(f"\nDataset size: {len(text):,} characters")
Expand Down
44 changes: 26 additions & 18 deletions generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
# Load Model
# ==============================================================================


def load_model(checkpoint_path: str = "checkpoints/model.pt"):
"""
Load a trained GPT model from checkpoint.
Expand All @@ -39,26 +40,26 @@ def load_model(checkpoint_path: str = "checkpoints/model.pt"):
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

# 3. Get the model configuration that was saved during training
config = checkpoint['config']
config = checkpoint["config"]

# 4. Create the tokenizer (we need the same vocabulary as training)
download_shakespeare()
with open(DATA_PATH, 'r', encoding='utf-8') as file:
with open(DATA_PATH, "r", encoding="utf-8") as file:
text = file.read()
tokenizer = CharacterTokenizer(text)

# 5. Create the model with the saved configuration
model = GPT(
vocab_size=config['vocab_size'],
embedding_dim=config['embedding_dim'],
num_heads=config['num_heads'],
num_layers=config['num_layers'],
block_size=config['block_size'],
dropout=0.0 # No dropout during generation
vocab_size=config["vocab_size"],
embedding_dim=config["embedding_dim"],
num_heads=config["num_heads"],
num_layers=config["num_layers"],
block_size=config["block_size"],
dropout=0.0, # No dropout during generation
)

# 6. Load the trained weights into the model
model.load_state_dict(checkpoint['model_state_dict'])
model.load_state_dict(checkpoint["model_state_dict"])

# 7. Move model to device and set to evaluation mode
model = model.to(device)
Expand All @@ -76,8 +77,16 @@ def load_model(checkpoint_path: str = "checkpoints/model.pt"):
# Generate Text
# ==============================================================================


@torch.no_grad()
def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, temperature: float = 0.8):
def generate(
model,
tokenizer,
device,
prompt: str,
max_tokens: int = 500,
temperature: float = 0.8,
):
"""
Generate text given a starting prompt.

Expand All @@ -103,13 +112,13 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
# 1. Convert prompt text to token IDs
prompt_ids = tokenizer.encode(prompt)
input_ids = torch.tensor(prompt_ids, dtype=torch.long, device=device)
input_ids = input_ids.unsqueeze(0) # Add batch dimension: shape becomes (1, seq_len)
input_ids = input_ids.unsqueeze(
0
) # Add batch dimension: shape becomes (1, seq_len)

# 2. Generate new tokens using the model's generate method
output_ids = model.generate(
input_ids=input_ids,
max_new_tokens=max_tokens,
temperature=temperature
input_ids=input_ids, max_new_tokens=max_tokens, temperature=temperature
)

# 3. Convert token IDs back to text
Expand All @@ -123,7 +132,6 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
# ==============================================================================

if __name__ == "__main__":

# 1. Load the trained model
print("=" * 60)
print("Shakespeare GPT - Text Generation")
Expand Down Expand Up @@ -152,7 +160,7 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
device=device,
prompt=prompt,
max_tokens=300,
temperature=0.8
temperature=0.8,
)

print(generated_text)
Expand All @@ -169,7 +177,7 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
try:
prompt = input("\nYour prompt: ")

if prompt.lower() in ['quit', 'exit', 'q']:
if prompt.lower() in ["quit", "exit", "q"]:
print("Farewell!")
break

Expand All @@ -182,7 +190,7 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
device=device,
prompt=prompt,
max_tokens=500,
temperature=0.8
temperature=0.8,
)

print("\n" + generated_text)
Expand Down
73 changes: 33 additions & 40 deletions model.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,9 @@ class TransformerBlock(nn.Module):
# The key relationship: head_size = embedding_dim / num_heads = 64
# This head_size=64 is consistent across most GPT models.
# -------------------------------------------------------------------------
def __init__(self,
embedding_dim: int = 384,
num_heads: int = 6,
dropout: float = 0.1):
def __init__(
self, embedding_dim: int = 384, num_heads: int = 6, dropout: float = 0.1
):
super().__init__()

# 2. Create the first Layer Normalization
Expand All @@ -98,7 +97,7 @@ def __init__(self,
embed_dim=embedding_dim,
num_heads=num_heads,
dropout=dropout,
batch_first=True # Input shape: (batch, sequence, embedding)
batch_first=True, # Input shape: (batch, sequence, embedding)
)

# 4. Create the second Layer Normalization
Expand All @@ -117,9 +116,9 @@ def __init__(self,
# patterns, then compresses back to the original size.
self.mlp = nn.Sequential(
nn.Linear(embedding_dim, 4 * embedding_dim), # Expand: 384 → 1536
nn.GELU(), # Activation function
nn.GELU(), # Activation function
nn.Linear(4 * embedding_dim, embedding_dim), # Project back: 1536 → 384
nn.Dropout(dropout) # Regularization
nn.Dropout(dropout), # Regularization
)

# 6. Create the forward method
Expand All @@ -140,7 +139,7 @@ def forward(self, x: torch.Tensor, causal_mask: torch.Tensor) -> torch.Tensor:
key=x_norm,
value=x_norm,
attn_mask=causal_mask,
is_causal=False # We provide our own mask
is_causal=False, # We provide our own mask
)
x = x + attn_output # Residual connection

Expand All @@ -165,13 +164,15 @@ class GPT(nn.Module):
"""

# 1. Initialize the class with hyperparameters
def __init__(self,
vocab_size: int,
embedding_dim: int = 384,
num_heads: int = 6,
num_layers: int = 6,
block_size: int = 256,
dropout: float = 0.1):
def __init__(
self,
vocab_size: int,
embedding_dim: int = 384,
num_heads: int = 6,
num_layers: int = 6,
block_size: int = 256,
dropout: float = 0.1,
):
super().__init__()

# 2. Store block_size for generation
Expand Down Expand Up @@ -210,30 +211,28 @@ def __init__(self,
# (e.g., start of sentence behaves differently from middle)
# -----------------------------------------------------------------------
self.token_embedding = nn.Embedding(
num_embeddings=vocab_size,
embedding_dim=embedding_dim
num_embeddings=vocab_size, embedding_dim=embedding_dim
)

# 4. Create Position Embedding layer
# Each position (0, 1, 2, ..., block_size-1) gets its own learnable vector
# Same as nn.Parameter(torch.randn(block_size, embedding_dim)) in ViT
self.position_embedding = nn.Embedding(
num_embeddings=block_size,
embedding_dim=embedding_dim
num_embeddings=block_size, embedding_dim=embedding_dim
)

# 5. Create Embedding Dropout
self.dropout = nn.Dropout(dropout)

# 6. Create stack of Transformer Blocks
self.blocks = nn.ModuleList([
TransformerBlock(
embedding_dim=embedding_dim,
num_heads=num_heads,
dropout=dropout
)
for _ in range(num_layers)
])
self.blocks = nn.ModuleList(
[
TransformerBlock(
embedding_dim=embedding_dim, num_heads=num_heads, dropout=dropout
)
for _ in range(num_layers)
]
)

# 7. Create Final Layer Normalization
self.ln_final = nn.LayerNorm(embedding_dim)
Expand Down Expand Up @@ -285,8 +284,7 @@ def __init__(self,
# torch.triu creates an upper triangular matrix of True values.
# True = masked (blocked), False = allowed to attend
causal_mask = torch.triu(
torch.ones(block_size, block_size, dtype=torch.bool),
diagonal=1
torch.ones(block_size, block_size, dtype=torch.bool), diagonal=1
)
# register_buffer: saves tensor with model & moves it to GPU with model,
# but it's NOT a learnable parameter (optimizer won't update it)
Expand Down Expand Up @@ -320,9 +318,7 @@ def _init_weights(self, module):
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

# 12. Create the forward method
def forward(self,
input_ids: torch.Tensor,
targets: torch.Tensor = None) -> tuple:
def forward(self, input_ids: torch.Tensor, targets: torch.Tensor = None) -> tuple:
"""
Forward pass of the GPT model.

Expand Down Expand Up @@ -397,7 +393,9 @@ def forward(self,

# 21. Generate method for text generation
@torch.no_grad()
def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: float = 1.0):
def generate(
self, input_ids: torch.Tensor, max_new_tokens: int, temperature: float = 1.0
):
"""
Generate new tokens one at a time (autoregressive generation).

Expand All @@ -410,7 +408,6 @@ def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: fl
temperature: Controls randomness (0.5=predictable, 1.0=normal, 1.5=creative)
"""
for _ in range(max_new_tokens):

# 22. If sequence is longer than block_size, crop to last block_size tokens
# .size(1) gets the sequence length (dim 0=batch, dim 1=sequence)
# -self.block_size uses negative indexing to take the LAST 256 tokens
Expand All @@ -419,7 +416,7 @@ def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: fl
if input_ids.size(1) <= self.block_size:
current_input = input_ids
else:
current_input = input_ids[:, -self.block_size:]
current_input = input_ids[:, -self.block_size :]

# 23. Get model predictions
logits, _ = self.forward(current_input)
Expand Down Expand Up @@ -457,11 +454,7 @@ def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: fl

# 2. Create model
model = GPT(
vocab_size=65,
embedding_dim=384,
num_heads=6,
num_layers=6,
block_size=256
vocab_size=65, embedding_dim=384, num_heads=6, num_layers=6, block_size=256
).to(device)

# 3. Create dummy input
Expand Down
Loading