atilsamancioglu · cgn-core · Mar 1, 2026
diff --git a/dataset.py b/dataset.py
@@ -100,7 +100,7 @@ def decode(self, ids: list) -> str:
         characters = []
         for id in ids:
             characters.append(self.id_to_char[id])
-        return ''.join(characters)
+        return "".join(characters)
 
 
 def get_batch(data: torch.Tensor, block_size: int, batch_size: int):
@@ -152,8 +152,10 @@ def get_batch(data: torch.Tensor, block_size: int, batch_size: int):
     y_list = []
 
     for pos in positions:
-        x_list.append(data[pos : pos + block_size])        # Input: chars 0 to n-1
-        y_list.append(data[pos + 1 : pos + block_size + 1])  # Target: chars 1 to n (shifted by 1)
+        x_list.append(data[pos : pos + block_size])  # Input: chars 0 to n-1
+        y_list.append(
+            data[pos + 1 : pos + block_size + 1]
+        )  # Target: chars 1 to n (shifted by 1)
 
     # 3. Stack into batch tensors: (batch_size, block_size)
     x = torch.stack(x_list)
@@ -179,7 +181,7 @@ def load_data(block_size: int = 256, train_split: float = 0.9):
     download_shakespeare()
 
     # 2. Load the text file
-    with open(DATA_PATH, 'r', encoding='utf-8') as file:
+    with open(DATA_PATH, "r", encoding="utf-8") as file:
         text = file.read()
 
     print(f"\nDataset size: {len(text):,} characters")

diff --git a/generate.py b/generate.py
@@ -18,6 +18,7 @@
 # Load Model
 # ==============================================================================
 
+
 def load_model(checkpoint_path: str = "checkpoints/model.pt"):
     """
     Load a trained GPT model from checkpoint.
@@ -39,26 +40,26 @@ def load_model(checkpoint_path: str = "checkpoints/model.pt"):
     checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
 
     # 3. Get the model configuration that was saved during training
-    config = checkpoint['config']
+    config = checkpoint["config"]
 
     # 4. Create the tokenizer (we need the same vocabulary as training)
     download_shakespeare()
-    with open(DATA_PATH, 'r', encoding='utf-8') as file:
+    with open(DATA_PATH, "r", encoding="utf-8") as file:
         text = file.read()
     tokenizer = CharacterTokenizer(text)
 
     # 5. Create the model with the saved configuration
     model = GPT(
-        vocab_size=config['vocab_size'],
-        embedding_dim=config['embedding_dim'],
-        num_heads=config['num_heads'],
-        num_layers=config['num_layers'],
-        block_size=config['block_size'],
-        dropout=0.0  # No dropout during generation
+        vocab_size=config["vocab_size"],
+        embedding_dim=config["embedding_dim"],
+        num_heads=config["num_heads"],
+        num_layers=config["num_layers"],
+        block_size=config["block_size"],
+        dropout=0.0,  # No dropout during generation
     )
 
     # 6. Load the trained weights into the model
-    model.load_state_dict(checkpoint['model_state_dict'])
+    model.load_state_dict(checkpoint["model_state_dict"])
 
     # 7. Move model to device and set to evaluation mode
     model = model.to(device)
@@ -76,8 +77,16 @@ def load_model(checkpoint_path: str = "checkpoints/model.pt"):
 # Generate Text
 # ==============================================================================
 
+
 @torch.no_grad()
-def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, temperature: float = 0.8):
+def generate(
+    model,
+    tokenizer,
+    device,
+    prompt: str,
+    max_tokens: int = 500,
+    temperature: float = 0.8,
+):
     """
     Generate text given a starting prompt.
 
@@ -103,13 +112,13 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
     # 1. Convert prompt text to token IDs
     prompt_ids = tokenizer.encode(prompt)
     input_ids = torch.tensor(prompt_ids, dtype=torch.long, device=device)
-    input_ids = input_ids.unsqueeze(0)  # Add batch dimension: shape becomes (1, seq_len)
+    input_ids = input_ids.unsqueeze(
+        0
+    )  # Add batch dimension: shape becomes (1, seq_len)
 
     # 2. Generate new tokens using the model's generate method
     output_ids = model.generate(
-        input_ids=input_ids,
-        max_new_tokens=max_tokens,
-        temperature=temperature
+        input_ids=input_ids, max_new_tokens=max_tokens, temperature=temperature
     )
 
     # 3. Convert token IDs back to text
@@ -123,7 +132,6 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
 # ==============================================================================
 
 if __name__ == "__main__":
-
     # 1. Load the trained model
     print("=" * 60)
     print("Shakespeare GPT - Text Generation")
@@ -152,7 +160,7 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
             device=device,
             prompt=prompt,
             max_tokens=300,
-            temperature=0.8
+            temperature=0.8,
         )
 
         print(generated_text)
@@ -169,7 +177,7 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
         try:
             prompt = input("\nYour prompt: ")
 
-            if prompt.lower() in ['quit', 'exit', 'q']:
+            if prompt.lower() in ["quit", "exit", "q"]:
                 print("Farewell!")
                 break
 
@@ -182,7 +190,7 @@ def generate(model, tokenizer, device, prompt: str, max_tokens: int = 500, tempe
                 device=device,
                 prompt=prompt,
                 max_tokens=500,
-                temperature=0.8
+                temperature=0.8,
             )
 
             print("\n" + generated_text)

diff --git a/model.py b/model.py
@@ -75,10 +75,9 @@ class TransformerBlock(nn.Module):
     # The key relationship: head_size = embedding_dim / num_heads = 64
     # This head_size=64 is consistent across most GPT models.
     # -------------------------------------------------------------------------
-    def __init__(self,
-                 embedding_dim: int = 384,
-                 num_heads: int = 6,
-                 dropout: float = 0.1):
+    def __init__(
+        self, embedding_dim: int = 384, num_heads: int = 6, dropout: float = 0.1
+    ):
         super().__init__()
 
         # 2. Create the first Layer Normalization
@@ -98,7 +97,7 @@ def __init__(self,
             embed_dim=embedding_dim,
             num_heads=num_heads,
             dropout=dropout,
-            batch_first=True  # Input shape: (batch, sequence, embedding)
+            batch_first=True,  # Input shape: (batch, sequence, embedding)
         )
 
         # 4. Create the second Layer Normalization
@@ -117,9 +116,9 @@ def __init__(self,
         # patterns, then compresses back to the original size.
         self.mlp = nn.Sequential(
             nn.Linear(embedding_dim, 4 * embedding_dim),  # Expand: 384 → 1536
-            nn.GELU(),                                     # Activation function
+            nn.GELU(),  # Activation function
             nn.Linear(4 * embedding_dim, embedding_dim),  # Project back: 1536 → 384
-            nn.Dropout(dropout)                            # Regularization
+            nn.Dropout(dropout),  # Regularization
         )
 
     # 6. Create the forward method
@@ -140,7 +139,7 @@ def forward(self, x: torch.Tensor, causal_mask: torch.Tensor) -> torch.Tensor:
             key=x_norm,
             value=x_norm,
             attn_mask=causal_mask,
-            is_causal=False  # We provide our own mask
+            is_causal=False,  # We provide our own mask
         )
         x = x + attn_output  # Residual connection
 
@@ -165,13 +164,15 @@ class GPT(nn.Module):
     """
 
     # 1. Initialize the class with hyperparameters
-    def __init__(self,
-                 vocab_size: int,
-                 embedding_dim: int = 384,
-                 num_heads: int = 6,
-                 num_layers: int = 6,
-                 block_size: int = 256,
-                 dropout: float = 0.1):
+    def __init__(
+        self,
+        vocab_size: int,
+        embedding_dim: int = 384,
+        num_heads: int = 6,
+        num_layers: int = 6,
+        block_size: int = 256,
+        dropout: float = 0.1,
+    ):
         super().__init__()
 
         # 2. Store block_size for generation
@@ -210,30 +211,28 @@ def __init__(self,
         #   (e.g., start of sentence behaves differently from middle)
         # -----------------------------------------------------------------------
         self.token_embedding = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=embedding_dim
+            num_embeddings=vocab_size, embedding_dim=embedding_dim
         )
 
         # 4. Create Position Embedding layer
         # Each position (0, 1, 2, ..., block_size-1) gets its own learnable vector
         # Same as nn.Parameter(torch.randn(block_size, embedding_dim)) in ViT
         self.position_embedding = nn.Embedding(
-            num_embeddings=block_size,
-            embedding_dim=embedding_dim
+            num_embeddings=block_size, embedding_dim=embedding_dim
         )
 
         # 5. Create Embedding Dropout
         self.dropout = nn.Dropout(dropout)
 
         # 6. Create stack of Transformer Blocks
-        self.blocks = nn.ModuleList([
-            TransformerBlock(
-                embedding_dim=embedding_dim,
-                num_heads=num_heads,
-                dropout=dropout
-            )
-            for _ in range(num_layers)
-        ])
+        self.blocks = nn.ModuleList(
+            [
+                TransformerBlock(
+                    embedding_dim=embedding_dim, num_heads=num_heads, dropout=dropout
+                )
+                for _ in range(num_layers)
+            ]
+        )
 
         # 7. Create Final Layer Normalization
         self.ln_final = nn.LayerNorm(embedding_dim)
@@ -285,8 +284,7 @@ def __init__(self,
         # torch.triu creates an upper triangular matrix of True values.
         # True = masked (blocked), False = allowed to attend
         causal_mask = torch.triu(
-            torch.ones(block_size, block_size, dtype=torch.bool),
-            diagonal=1
+            torch.ones(block_size, block_size, dtype=torch.bool), diagonal=1
         )
         # register_buffer: saves tensor with model & moves it to GPU with model,
         # but it's NOT a learnable parameter (optimizer won't update it)
@@ -320,9 +318,7 @@ def _init_weights(self, module):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
 
     # 12. Create the forward method
-    def forward(self,
-                input_ids: torch.Tensor,
-                targets: torch.Tensor = None) -> tuple:
+    def forward(self, input_ids: torch.Tensor, targets: torch.Tensor = None) -> tuple:
         """
         Forward pass of the GPT model.
 
@@ -397,7 +393,9 @@ def forward(self,
 
     # 21. Generate method for text generation
     @torch.no_grad()
-    def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: float = 1.0):
+    def generate(
+        self, input_ids: torch.Tensor, max_new_tokens: int, temperature: float = 1.0
+    ):
         """
         Generate new tokens one at a time (autoregressive generation).
 
@@ -410,7 +408,6 @@ def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: fl
             temperature: Controls randomness (0.5=predictable, 1.0=normal, 1.5=creative)
         """
         for _ in range(max_new_tokens):
-
             # 22. If sequence is longer than block_size, crop to last block_size tokens
             # .size(1) gets the sequence length (dim 0=batch, dim 1=sequence)
             # -self.block_size uses negative indexing to take the LAST 256 tokens
@@ -419,7 +416,7 @@ def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: fl
             if input_ids.size(1) <= self.block_size:
                 current_input = input_ids
             else:
-                current_input = input_ids[:, -self.block_size:]
+                current_input = input_ids[:, -self.block_size :]
 
             # 23. Get model predictions
             logits, _ = self.forward(current_input)
@@ -457,11 +454,7 @@ def generate(self, input_ids: torch.Tensor, max_new_tokens: int, temperature: fl
 
     # 2. Create model
     model = GPT(
-        vocab_size=65,
-        embedding_dim=384,
-        num_heads=6,
-        num_layers=6,
-        block_size=256
+        vocab_size=65, embedding_dim=384, num_heads=6, num_layers=6, block_size=256
     ).to(device)
 
     # 3. Create dummy input