kyegomez · 0riginal-claw · May 16, 2026 · May 16, 2026 · May 16, 2026 · May 16, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,23 @@
+name: CI
+
+on: [push, pull_request]
+
+jobs:
+  smoke:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install package and test dependencies
+        run: |
+          pip install torch --index-url https://download.pytorch.org/whl/cpu
+          pip install -e .
+          pip install pytest
+
+      - name: Run tests
+        run: pytest tests/ -v --tb=short
diff --git a/README.md b/README.md
@@ -46,6 +46,12 @@ To enable Flash Attention 2 in `GQAttention` (requires CUDA and build tools):
 pip install open-mythos[flash]
 ```
 
+> **Optional speedup:** `flash-attn` delivers a roughly 30% throughput improvement on
+> CUDA hardware. If it is absent (Intel Mac, ROCm, CPU-only, or missing build
+> tools) the model automatically falls back to standard scaled-dot-product
+> attention — no code change required. All functionality is identical; only
+> throughput differs.
+
 ## Usage
 
 ```python
@@ -96,8 +102,12 @@ print(f"[{attn_type.upper()}] Logits shape: {logits.shape}")
 out = model.generate(ids, max_new_tokens=8, n_loops=8)
 print(f"[{attn_type.upper()}] Generated shape: {out.shape}")
 
-A = model.recurrent.injection.get_A()
-rho = torch.linalg.eigvals(A).abs().max().item()
+A_diag = model.recurrent.injection.get_A()
+# get_A() returns the diagonal of A_discrete as a 1-D tensor.
+# For a diagonal matrix the eigenvalues are the diagonal entries, so the
+# spectral radius is max(|diag|).  torch.linalg.eigvals only accepts square
+# matrices (>= 2-D input), so take abs().max() of the 1-D vector directly.
+rho = A_diag.abs().max().item()
 print(
     f"[{attn_type.upper()}] Spectral radius ρ(A) = {rho:.4f} (must be < 1)"
 )

diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md
@@ -0,0 +1,126 @@
+# Getting Started with OpenMythos
+
+Get OpenMythos running in under 5 minutes on any machine, including CPU-only
+laptops and Intel Macs.
+
+## 1. Install
+
+```bash
+pip install open-mythos
+```
+
+**Note on flash-attn:** `flash-attn` is optional and only available on CUDA
+hardware. The model falls back to standard scaled-dot-product attention
+automatically when it is absent — no code change needed.
+
+On CUDA machines, install the optional extra to get the ~30% throughput speedup:
+
+```bash
+pip install open-mythos[flash]
+```
+
+## 2. Instant smoke test with TINY_CONFIG
+
+`TINY_CONFIG` is a pre-built preset (~1-3M parameters, CPU-safe) included for
+quick verification that your install is working.
+
+```python
+import torch
+from open_mythos.main import OpenMythos
+from open_mythos.variants import TINY_CONFIG
+
+# 1. Instantiate the model
+model = OpenMythos(TINY_CONFIG)
+model.eval()
+print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+# 2. Create a random input batch  (batch=1, sequence_length=16)
+ids = torch.randint(0, TINY_CONFIG.vocab_size, (1, 16))
+
+# 3. Run a forward pass (2 recurrent loop iterations)
+with torch.no_grad():
+    logits = model(ids, n_loops=TINY_CONFIG.max_loop_iters)
+
+# 4. Verify output shape: (batch, seq_len, vocab_size)
+print(f"Output shape: {list(logits.shape)}")
+assert logits.shape == (1, 16, TINY_CONFIG.vocab_size)
+
+# 5. Verify LTI stability: spectral radius of A must be < 1
+A_diag = model.recurrent.injection.get_A()
+rho = A_diag.abs().max().item()
+print(f"Spectral radius rho(A) = {rho:.6f}  (must be < 1)")
+assert rho < 1.0
+
+# 6. Quick backward pass check
+model.train()
+logits2 = model(ids, n_loops=TINY_CONFIG.max_loop_iters)
+logits2.mean().backward()
+grad_norm = sum(
+    p.grad.norm().item() ** 2 for p in model.parameters() if p.grad is not None
+) ** 0.5
+print(f"Gradient norm: {grad_norm:.4f}  (should be > 0)")
+assert grad_norm > 0
+
+print("All checks passed.")
+```
+
+Expected output (the numbers shown are illustrative; yours will differ):
+
+```
+Parameters: 1,234,567
+Output shape: [1, 16, 1000]
+Spectral radius rho(A) = 0.412345  (must be < 1)
+Gradient norm: 3.2109  (should be > 0)
+All checks passed.
+```
+
+## 3. Use a production-scale config
+
+Switch from `TINY_CONFIG` to one of the named presets for real experiments:
+
+```python
+from open_mythos.variants import mythos_1b
+from open_mythos.main import OpenMythos
+
+cfg = mythos_1b()          # 1B-parameter research config
+model = OpenMythos(cfg)
+```
+
+See [variants.py](../open_mythos/variants.py) for the full list:
+`mythos_1b`, `mythos_3b`, `mythos_10b`, `mythos_50b`, `mythos_100b`,
+`mythos_500b`, `mythos_1t`.
+
+## 4. Generate text
+
+```python
+import torch
+from open_mythos.main import OpenMythos
+from open_mythos.variants import TINY_CONFIG
+
+model = OpenMythos(TINY_CONFIG)
+model.eval()
+
+prompt = torch.randint(0, TINY_CONFIG.vocab_size, (1, 8))
+with torch.no_grad():
+    output = model.generate(prompt, max_new_tokens=16, n_loops=4)
+
+print(f"Generated shape: {list(output.shape)}")  # [1, 24]
+```
+
+## 5. Run the test suite
+
+```bash
+pip install pytest
+pytest tests/ -v
+```
+
+The tests use `TINY_CONFIG` so they complete in seconds on CPU.
+
+## Troubleshooting
+
+| Symptom | Fix |
+|---|---|
+| `pip install` fails with `torch==2.11.0 not found` | Upgrade to `open-mythos>=0.5.1`; the exact `torch==2.11.0` pin was relaxed to `torch>=2.1` |
+| `ImportError: flash_attn` | Not needed — the model falls back automatically. Only install it on CUDA. |
+| `RuntimeError: eigvals expects 2-D input` | `get_A()` returns the 1-D diagonal of A, so use `get_A().abs().max()` as shown above, not `torch.linalg.eigvals(get_A())` |
+| Tests fail with OOM | Tests use `TINY_CONFIG` and should fit in < 200 MB RAM |
diff --git a/open_mythos/__init__.py b/open_mythos/__init__.py
@@ -17,13 +17,15 @@
 )
 from open_mythos.tokenizer import MythosTokenizer
 from open_mythos.variants import (
+    TINY_CONFIG,
     mythos_1b,
     mythos_1t,
     mythos_3b,
     mythos_10b,
     mythos_50b,
     mythos_100b,
     mythos_500b,
+    mythos_tiny,
 )
 
 __all__ = [
@@ -42,6 +44,8 @@
     "precompute_rope_freqs",
     "apply_rope",
     "loop_index_embedding",
+    "mythos_tiny",
+    "TINY_CONFIG",
     "mythos_1b",
     "mythos_3b",
     "mythos_10b",

diff --git a/open_mythos/variants.py b/open_mythos/variants.py
@@ -1,5 +1,57 @@
 from open_mythos.main import MythosConfig
 
+# ---------------------------------------------------------------------------
+# mythos_tiny / TINY_CONFIG — unit testing + CPU smoke testing only
+# ---------------------------------------------------------------------------
+
+
+def mythos_tiny() -> MythosConfig:
+    """Return a new tiny MythosConfig for unit tests and CPU smoke tests.
+
+    Targets roughly 1-3M parameters so that tests complete in seconds on
+    any machine without a GPU.  Not suitable for training or inference on
+    real text — use mythos_1b() or larger for that.
+
+    Architecture choices (each mirrors a keyword argument passed below):
+    - dim=128, n_heads=4, n_kv_heads=4 (dim / n_heads = 32)
+    - attn_type="mla": kv_lora_rank=32, q_lora_rank=64, 16-dim rope/nope/v heads
+    - MoE: n_experts=2, n_shared_experts=1, n_experts_per_tok=1, expert_dim=32
+    - max_loop_iters=2 (recurrent loop iterations)
+    - prelude_layers=1, coda_layers=1, lora_rank=4, dropout=0.0
+    - vocab_size=1000, max_seq_len=64
+    """
+    return MythosConfig(
+        vocab_size=1000,
+        dim=128,
+        n_heads=4,
+        n_kv_heads=4,
+        max_seq_len=64,
+        max_loop_iters=2,
+        prelude_layers=1,
+        coda_layers=1,
+        attn_type="mla",
+        kv_lora_rank=32,
+        q_lora_rank=64,
+        qk_rope_head_dim=16,
+        qk_nope_head_dim=16,
+        v_head_dim=16,
+        n_experts=2,
+        n_shared_experts=1,
+        n_experts_per_tok=1,
+        expert_dim=32,
+        act_threshold=0.99,
+        rope_theta=10000.0,
+        lora_rank=4,
+        dropout=0.0,
+    )
+
+
+# ---------------------------------------------------------------------------
+# TINY_CONFIG — shared instance for tests; call mythos_tiny() for a fresh copy
+# ---------------------------------------------------------------------------
+
+TINY_CONFIG = mythos_tiny()
+
 # Parameter budget breakdown per variant:
 #   total ≈ embed + prelude/coda dense blocks + recurrent MLA + MoE
 #   MoE   = 3 * dim * expert_dim * (n_experts + n_shared * n_experts_per_tok)

diff --git a/pyproject.toml b/pyproject.toml
@@ -38,7 +38,7 @@ classifiers = [
 
 [tool.poetry.dependencies]
 python = ">=3.10,<4.0"
-torch = "2.11.0"
+torch = ">=2.1"
 transformers = ">=4.40.0"
 datasets = ">=2.18.0"