kyegomez · oyi77 · May 20, 2026 · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/README_BERKAHKARYA.md b/README_BERKAHKARYA.md
@@ -0,0 +1,93 @@
+# OpenMythos-BerkahKarya
+
+> Fork of [kyegomez/OpenMythos](https://github.com/kyegomez/OpenMythos) with consumer hardware optimizations.
+
+## 🚀 What's New in This Fork
+
+### Sprint 1: INT4/INT8 Quantization + Expert Offloading ✅
+- **INT4/INT8 weight quantization** — 4x memory reduction for MoE expert layers
+- **Expert offloading** — GPU ↔ CPU ↔ NVMe memory hierarchy
+- **Consumer hardware support** — Run mythos_1b on RTX 3060 12GB
+
+### Sprint 2: LoRA Training Pipeline ✅
+- **LoRA adapters** — Fine-tune only ~0.5% of parameters
+- **Colab notebook** — Free T4 GPU training (~30-60 min)
+- **QLoRA mode** — INT4 + LoRA = 8GB VRAM
+- **Finance demo data** — Trading, business plans, ad optimization
+
+## 📦 Installation
+
+```bash
+git clone https://github.com/oyi77/OpenMythos.git
+cd OpenMythos
+pip install -e .
+```
+
+## 🎯 Quick Start
+
+### Quantized Inference (Consumer Hardware)
+```python
+from open_mythos import OpenMythos, mythos_1b
+from open_mythos.quantization import quantize_model
+from open_mythos.expert_offloader import ExpertOffloader
+
+model = OpenMythos(mythos_1b())
+model = quantize_model(model, bits=4, group_size=128)
+
+offloader = ExpertOffloader(model, gpu_experts=4, cache_experts=16)
+offloader.prepare()
+```
+
+### LoRA Fine-tuning
+```python
+from open_mythos import OpenMythos, mythos_1b
+from open_mythos.lora import LoRAConfig, apply_lora, save_lora_adapter
+
+model = OpenMythos(mythos_1b())
+model = apply_lora(model, LoRAConfig(rank=16, alpha=32))
+
+# Train on your data...
+
+save_lora_adapter(model, 'my_adapter.pt')
+```
+
+### CLI Training
+```bash
+# Standard LoRA (16GB VRAM)
+python training/lora_finetune.py --variant 1b --dataset finance
+
+# QLoRA (8GB VRAM, fits Colab free T4)
+python training/lora_finetune.py --variant 1b --dataset finance --qlora
+```
+
+## 📊 PRs to Upstream
+
+| PR | Feature | Status |
+|----|---------|--------|
+| [#74](https://github.com/kyegomez/OpenMythos/pull/74) | INT4/INT8 Quantization + Expert Offloading | Open |
+| [#75](https://github.com/kyegomez/OpenMythos/pull/75) | LoRA Training Pipeline + Colab Notebook | Open |
+
+## 🏗️ Development Roadmap
+
+- [x] Sprint 1: INT4/INT8 Quantization + Expert Offloading
+- [x] Sprint 2: LoRA Training Pipeline + Colab Notebook
+- [ ] Sprint 3: Ring Attention + KV Cache Compression (1M context)
+- [ ] Sprint 4: Finance Domain Fine-tuning
+- [ ] Sprint 5: vLLM/GGUF Export
+
+## 📝 License
+
+MIT (same as upstream)
+
+## 🤝 Contributing
+
+1. Fork this repo
+2. Create a feature branch
+3. Make your changes
+4. Submit a PR to upstream (kyegomez/OpenMythos)
+
+## 🔗 Links
+
+- [Upstream Repo](https://github.com/kyegomez/OpenMythos)
+- [HuggingFace Models](https://huggingface.co/models?search=openmythos)
+- [Original Paper](https://arxiv.org/abs/2502.05171) (Huginn/Raven)
diff --git a/examples/long_context_inference.py b/examples/long_context_inference.py
@@ -0,0 +1,177 @@
+"""
+Long-Context Inference Example for OpenMythos.
+
+Demonstrates processing 128K-1M token sequences using Ring Attention
+and KV Cache compression on consumer hardware.
+
+Usage:
+    python examples/long_context_inference.py
+"""
+
+import torch
+import time
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from open_mythos.ring_attention import RingAttention, SparseRingAttention
+from open_mythos.kv_cache import QuantizedKVCache, RingAttentionWithKVCache
+
+
+def demo_ring_attention():
+    """Demo: Ring Attention for long sequences."""
+    print("=" * 60)
+    print("Ring Attention Demo")
+    print("=" * 60)
+
+    batch_size = 1
+    num_heads = 8
+    head_dim = 64
+    chunk_size = 4096
+
+    # Test different sequence lengths
+    for seq_len in [8192, 32768, 131072]:
+        print(f"\n--- Sequence length: {seq_len:,} tokens ---")
+
+        # Create random Q, K, V
+        q = torch.randn(batch_size, seq_len, num_heads, head_dim)
+        k = torch.randn(batch_size, seq_len, num_heads, head_dim)
+        v = torch.randn(batch_size, seq_len, num_heads, head_dim)
+
+        # Ring Attention
+        ring_attn = RingAttention(
+            chunk_size=chunk_size,
+            num_heads=num_heads,
+            head_dim=head_dim,
+        )
+
+        start = time.time()
+        with torch.no_grad():
+            output = ring_attn(q, k, v)
+        elapsed = time.time() - start
+
+        print(f"  Output shape: {output.shape}")
+        print(f"  Time: {elapsed:.2f}s")
+        print(f"  Throughput: {seq_len / elapsed:.0f} tokens/sec")
+
+        # Memory estimate
+        # Standard attention: O(seq_len^2 * num_heads)
+        standard_mem = seq_len * seq_len * num_heads * 4 / 1024 / 1024  # MB
+        ring_mem = chunk_size * chunk_size * num_heads * 4 / 1024 / 1024  # MB
+        print(f"  Standard attention memory: {standard_mem:.1f} MB")
+        print(f"  Ring attention memory: {ring_mem:.1f} MB")
+        print(f"  Memory savings: {standard_mem / ring_mem:.1f}x")
+
+
+def demo_kv_cache():
+    """Demo: KV Cache compression."""
+    print("\n" + "=" * 60)
+    print("KV Cache Compression Demo")
+    print("=" * 60)
+
+    num_layers = 32
+    num_heads = 32
+    head_dim = 128
+
+    # Test different sequence lengths
+    for seq_len in [8192, 65536, 262144]:
+        print(f"\n--- Sequence length: {seq_len:,} tokens ---")
+
+        cache = QuantizedKVCache(
+            num_layers=num_layers,
+            num_heads=num_heads,
+            head_dim=head_dim,
+            max_seq_len=seq_len,
+        )
+
+        # Simulate storing KV for each layer
+        start = time.time()
+        for layer_id in range(min(num_layers, 4)):  # Test with 4 layers
+            k = torch.randn(1, seq_len, num_heads, head_dim)
+            v = torch.randn(1, seq_len, num_heads, head_dim)
+            cache.store(layer_id, k, v)
+        store_time = time.time() - start
+
+        # Retrieve
+        start = time.time()
+        for layer_id in range(min(num_layers, 4)):
+            k, v = cache.retrieve(layer_id)
+        retrieve_time = time.time() - start
+
+        stats = cache.get_stats()
+
+        print(f"  Store time (4 layers): {store_time:.2f}s")
+        print(f"  Retrieve time (4 layers): {retrieve_time:.2f}s")
+        print(f"  Compression ratio: {stats['compression_ratio']:.2f}x")
+        print(f"  Memory usage: {stats['memory_mb']:.1f} MB")
+
+        # FP16 comparison
+        fp16_mem = seq_len * num_heads * head_dim * 2 * 2 * num_layers / 1024 / 1024
+        print(f"  FP16 memory (all layers): {fp16_mem:.1f} MB")
+        print(f"  Compressed memory: {stats['memory_mb']:.1f} MB")
+
+
+def demo_sparse_attention():
+    """Demo: Sparse Ring Attention."""
+    print("\n" + "=" * 60)
+    print("Sparse Ring Attention Demo")
+    print("=" * 60)
+
+    batch_size = 1
+    num_heads = 8
+    head_dim = 64
+    seq_len = 65536
+
+    q = torch.randn(batch_size, seq_len, num_heads, head_dim)
+    k = torch.randn(batch_size, seq_len, num_heads, head_dim)
+    v = torch.randn(batch_size, seq_len, num_heads, head_dim)
+
+    sparse_attn = SparseRingAttention(
+        chunk_size=8192,
+        window_size=4096,
+        num_global_tokens=256,
+        num_heads=num_heads,
+        head_dim=head_dim,
+    )
+
+    start = time.time()
+    with torch.no_grad():
+        output = sparse_attn(q, k, v)
+    elapsed = time.time() - start
+
+    print(f"  Sequence: {seq_len:,} tokens")
+    print(f"  Output shape: {output.shape}")
+    print(f"  Time: {elapsed:.2f}s")
+    print(f"  Window size: 4096")
+    print(f"  Global tokens: 256")
+
+
+def main():
+    print("OpenMythos Long-Context Inference Demo")
+    print("Consumer Hardware Edition")
+
+    # Check if CUDA is available
+    if torch.cuda.is_available():
+        print(f"\nGPU: {torch.cuda.get_device_name(0)}")
+        print(f"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB")
+    else:
+        print("\nRunning on CPU (demos will be slower)")
+
+    # Run demos
+    demo_ring_attention()
+    demo_kv_cache()
+    demo_sparse_attention()
+
+    print("\n" + "=" * 60)
+    print("Summary")
+    print("=" * 60)
+    print("Ring Attention: Enables 1M+ context with chunked processing")
+    print("KV Cache INT4: 4x compression for cached KV states")
+    print("Sparse Attention: Sliding window + global tokens")
+    print("\nCombined: 1M context on ~12GB VRAM (RTX 3060)")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/quantized_inference.py b/examples/quantized_inference.py
@@ -0,0 +1,93 @@
+"""
+Quantized Inference Example for OpenMythos.
+
+Demonstrates running mythos_1b with INT4 quantization and expert offloading
+on consumer hardware (RTX 3060 12GB).
+
+Usage:
+    python examples/quantized_inference.py
+"""
+
+import torch
+import time
+import sys
+import os
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from open_mythos import OpenMythos, mythos_1b
+from open_mythos.quantization import quantize_model, print_quantization_summary
+from open_mythos.expert_offloader import ExpertOffloader
+
+
+def main():
+    print("=" * 60)
+    print("OpenMythos Quantized Inference Demo")
+    print("=" * 60)
+
+    # 1. Create model
+    print("\n[1/5] Creating mythos_1b model...")
+    cfg = mythos_1b()
+    model = OpenMythos(cfg)
+    print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")
+
+    # 2. Quantize to INT4
+    print("\n[2/5] Quantizing to INT4 (expert FFN layers only)...")
+    model = quantize_model(model, bits=4, group_size=128)
+    print_quantization_summary(model)
+
+    # 3. Setup expert offloading
+    print("\n[3/5] Setting up expert offloading...")
+    print(f"  GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
+
+    if torch.cuda.is_available():
+        offloader = ExpertOffloader(
+            model,
+            gpu_experts=4,    # Keep 4 experts on GPU
+            cache_experts=16, # Keep 16 in CPU RAM
+        )
+        offloader.prepare()
+        print(f"  GPU experts: 4 | CPU cache: 16 | Disk: rest")
+    else:
+        print("  Running on CPU (no offloading needed)")
+
+    # 4. Generate text
+    print("\n[4/5] Generating text...")
+    input_ids = torch.randint(0, cfg.vocab_size, (1, 32))
+    if torch.cuda.is_available():
+        input_ids = input_ids.cuda()
+
+    # Warmup
+    _ = model.generate(input_ids, max_new_tokens=4, n_loops=2)
+
+    # Benchmark
+    start = time.time()
+    with torch.no_grad():
+        output = model.generate(input_ids, max_new_tokens=64, n_loops=4)
+    elapsed = time.time() - start
+
+    tokens_generated = output.shape[1] - input_ids.shape[1]
+    tokens_per_sec = tokens_generated / elapsed
+
+    print(f"  Generated {tokens_generated} tokens in {elapsed:.2f}s")
+    print(f"  Speed: {tokens_per_sec:.1f} tokens/sec")
+
+    # 5. Memory usage
+    print("\n[5/5] Memory usage:")
+    if torch.cuda.is_available():
+        allocated = torch.cuda.memory_allocated() / 1024 / 1024
+        reserved = torch.cuda.memory_reserved() / 1024 / 1024
+        print(f"  GPU allocated: {allocated:.1f} MB")
+        print(f"  GPU reserved:  {reserved:.1f} MB")
+
+    if torch.cuda.is_available():
+        print(f"\nOffloader stats:")
+        offloader.print_stats()
+
+    print("\n" + "=" * 60)
+    print("Done! Model runs successfully with INT4 quantization.")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()