Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions README_BERKAHKARYA.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# OpenMythos-BerkahKarya

> Fork of [kyegomez/OpenMythos](https://github.com/kyegomez/OpenMythos) with consumer hardware optimizations.

## 🚀 What's New in This Fork

### Sprint 1: INT4/INT8 Quantization + Expert Offloading ✅
- **INT4/INT8 weight quantization** — up to 4x memory reduction for MoE expert layers
- **Expert offloading** — GPU ↔ CPU ↔ NVMe memory hierarchy
- **Consumer hardware support** — Run mythos_1b on RTX 3060 12GB

### Sprint 2: LoRA Training Pipeline ✅
- **LoRA adapters** — Fine-tune only ~0.5% of parameters
- **Colab notebook** — Free T4 GPU training (~30-60 min)
- **QLoRA mode** — INT4 quantization + LoRA, fits in 8GB VRAM
- **Finance demo data** — Trading, business plans, ad optimization

## 📦 Installation

```bash
git clone https://github.com/oyi77/OpenMythos.git
cd OpenMythos
pip install -e .
```

## 🎯 Quick Start

### Quantized Inference (Consumer Hardware)
```python
from open_mythos import OpenMythos, mythos_1b
from open_mythos.quantization import quantize_model
from open_mythos.expert_offloader import ExpertOffloader

model = OpenMythos(mythos_1b())
model = quantize_model(model, bits=4, group_size=128)

offloader = ExpertOffloader(model, gpu_experts=4, cache_experts=16)
offloader.prepare()
```

### LoRA Fine-tuning
```python
from open_mythos import OpenMythos, mythos_1b
from open_mythos.lora import LoRAConfig, apply_lora, save_lora_adapter

model = OpenMythos(mythos_1b())
model = apply_lora(model, LoRAConfig(rank=16, alpha=32))

# Train on your data...

save_lora_adapter(model, 'my_adapter.pt')
```

### CLI Training
```bash
# Standard LoRA (16GB VRAM)
python training/lora_finetune.py --variant 1b --dataset finance

# QLoRA (8GB VRAM, fits Colab free T4)
python training/lora_finetune.py --variant 1b --dataset finance --qlora
```

## 📊 PRs to Upstream

| PR | Feature | Status |
|----|---------|--------|
| [#74](https://github.com/kyegomez/OpenMythos/pull/74) | INT4/INT8 Quantization + Expert Offloading | Open |
| [#75](https://github.com/kyegomez/OpenMythos/pull/75) | LoRA Training Pipeline + Colab Notebook | Open |

## 🏗️ Development Roadmap

- [x] Sprint 1: INT4/INT8 Quantization + Expert Offloading
- [x] Sprint 2: LoRA Training Pipeline + Colab Notebook
- [ ] Sprint 3: Ring Attention + KV Cache Compression (1M context)
- [ ] Sprint 4: Finance Domain Fine-tuning
- [ ] Sprint 5: vLLM/GGUF Export

## 📝 License

MIT (same as upstream)

## 🤝 Contributing

1. Fork this repo
2. Create a feature branch
3. Make your changes
4. Submit a PR to upstream (kyegomez/OpenMythos)

## 🔗 Links

- [Upstream Repo](https://github.com/kyegomez/OpenMythos)
- [HuggingFace Models](https://huggingface.co/models?search=openmythos)
- [Original Paper](https://arxiv.org/abs/2502.05171) (Huginn/Raven)
177 changes: 177 additions & 0 deletions examples/long_context_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
"""
Long-Context Inference Example for OpenMythos.

Demonstrates processing 128K-1M token sequences using Ring Attention
and KV Cache compression on consumer hardware.

Usage:
python examples/long_context_inference.py
"""

import torch
import time
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from open_mythos.ring_attention import RingAttention, SparseRingAttention
from open_mythos.kv_cache import QuantizedKVCache, RingAttentionWithKVCache


def demo_ring_attention():
    """Time ``RingAttention`` on random inputs of increasing sequence length.

    For every sequence length the demo draws random Q/K/V tensors, runs one
    forward pass with gradients disabled, and prints the output shape, the
    wall-clock time, the resulting throughput, and a rough estimate of the
    attention-score memory needed by full attention versus one chunk.
    """
    banner = "=" * 60
    print(banner)
    print("Ring Attention Demo")
    print(banner)

    batch_size, num_heads, head_dim = 1, 8, 64
    chunk_size = 4096

    def score_matrix_mb(side):
        # A (side x side) score matrix per head, 4 bytes per element, in MB.
        return side * side * num_heads * 4 / 1024 / 1024

    for seq_len in (8192, 32768, 131072):
        print(f"\n--- Sequence length: {seq_len:,} tokens ---")

        # Random Q, K, V drawn in that order.
        shape = (batch_size, seq_len, num_heads, head_dim)
        q = torch.randn(*shape)
        k = torch.randn(*shape)
        v = torch.randn(*shape)

        # A fresh attention module is built for every sequence length.
        ring_attn = RingAttention(
            chunk_size=chunk_size,
            num_heads=num_heads,
            head_dim=head_dim,
        )

        t0 = time.time()
        with torch.no_grad():
            output = ring_attn(q, k, v)
        elapsed = time.time() - t0

        print(f" Output shape: {output.shape}")
        print(f" Time: {elapsed:.2f}s")
        print(f" Throughput: {seq_len / elapsed:.0f} tokens/sec")

        # Memory estimate: full attention grows with seq_len^2, the chunked
        # variant with chunk_size^2.
        standard_mem = score_matrix_mb(seq_len)
        ring_mem = score_matrix_mb(chunk_size)
        print(f" Standard attention memory: {standard_mem:.1f} MB")
        print(f" Ring attention memory: {ring_mem:.1f} MB")
        print(f" Memory savings: {standard_mem / ring_mem:.1f}x")


def demo_kv_cache():
    """Exercise ``QuantizedKVCache`` store/retrieve at several sequence lengths.

    For each length a cache is created, random K/V tensors are stored for the
    first four layers, the same layers are read back, and the timings plus the
    statistics reported by ``cache.get_stats()`` are printed next to an FP16
    size estimate.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("KV Cache Compression Demo")
    print(banner)

    num_layers, num_heads, head_dim = 32, 32, 128

    for seq_len in (8192, 65536, 262144):
        print(f"\n--- Sequence length: {seq_len:,} tokens ---")

        cache = QuantizedKVCache(
            num_layers=num_layers,
            num_heads=num_heads,
            head_dim=head_dim,
            max_seq_len=seq_len,
        )

        # Only a handful of layers are exercised to keep the demo short.
        layers_under_test = range(min(num_layers, 4))

        # Store phase: fresh random K/V per layer.
        t0 = time.time()
        for layer_id in layers_under_test:
            k = torch.randn(1, seq_len, num_heads, head_dim)
            v = torch.randn(1, seq_len, num_heads, head_dim)
            cache.store(layer_id, k, v)
        store_time = time.time() - t0

        # Retrieve phase: read every stored layer back.
        t0 = time.time()
        for layer_id in layers_under_test:
            k, v = cache.retrieve(layer_id)
        retrieve_time = time.time() - t0

        stats = cache.get_stats()

        print(f" Store time (4 layers): {store_time:.2f}s")
        print(f" Retrieve time (4 layers): {retrieve_time:.2f}s")
        print(f" Compression ratio: {stats['compression_ratio']:.2f}x")
        print(f" Memory usage: {stats['memory_mb']:.1f} MB")

        # FP16 baseline: 2 bytes per element, K and V, across all layers.
        # NOTE(review): this figure covers all 32 layers while only 4 layers
        # were stored above; confirm whether stats['memory_mb'] is comparable.
        fp16_mem = seq_len * num_heads * head_dim * 2 * 2 * num_layers / 1024 / 1024
        print(f" FP16 memory (all layers): {fp16_mem:.1f} MB")
        print(f" Compressed memory: {stats['memory_mb']:.1f} MB")


def demo_sparse_attention():
    """Time one ``SparseRingAttention`` forward pass on a 65,536-token input.

    Random Q/K/V tensors are pushed through the module with gradients
    disabled; the sequence length, output shape, elapsed time, and the window
    and global-token settings that were passed to the module are printed.
    """
    print("\n" + "=" * 60)
    print("Sparse Ring Attention Demo")
    print("=" * 60)

    batch_size = 1
    num_heads = 8
    head_dim = 64
    seq_len = 65536

    # Named once so the values reported below cannot drift from the values
    # actually handed to the attention module.
    chunk_size = 8192
    window_size = 4096
    num_global_tokens = 256

    q = torch.randn(batch_size, seq_len, num_heads, head_dim)
    k = torch.randn(batch_size, seq_len, num_heads, head_dim)
    v = torch.randn(batch_size, seq_len, num_heads, head_dim)

    sparse_attn = SparseRingAttention(
        chunk_size=chunk_size,
        window_size=window_size,
        num_global_tokens=num_global_tokens,
        num_heads=num_heads,
        head_dim=head_dim,
    )

    start = time.time()
    with torch.no_grad():
        output = sparse_attn(q, k, v)
    elapsed = time.time() - start

    print(f" Sequence: {seq_len:,} tokens")
    print(f" Output shape: {output.shape}")
    print(f" Time: {elapsed:.2f}s")
    print(f" Window size: {window_size}")
    print(f" Global tokens: {num_global_tokens}")


def main():
    """Run all long-context demos and print a short summary.

    Prints the detected GPU name and total VRAM when CUDA is available
    (otherwise a CPU notice), then runs the ring-attention, KV-cache, and
    sparse-attention demos in that order.
    """
    print("OpenMythos Long-Context Inference Demo")
    print("Consumer Hardware Edition")

    # Check if CUDA is available
    if torch.cuda.is_available():
        # The device-properties attribute is `total_memory` (bytes);
        # `total_mem` does not exist and raised AttributeError on GPU hosts.
        props = torch.cuda.get_device_properties(0)
        print(f"\nGPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {props.total_memory / 1024**3:.1f} GB")
    else:
        print("\nRunning on CPU (demos will be slower)")

    # Run demos
    demo_ring_attention()
    demo_kv_cache()
    demo_sparse_attention()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print("Ring Attention: Enables 1M+ context with chunked processing")
    print("KV Cache INT4: 4x compression for cached KV states")
    print("Sparse Attention: Sliding window + global tokens")
    print("\nCombined: 1M context on ~12GB VRAM (RTX 3060)")
    print("=" * 60)


# Only run the demos when executed as a script, not when imported.
if __name__ == "__main__":
    main()
93 changes: 93 additions & 0 deletions examples/quantized_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
"""
Quantized Inference Example for OpenMythos.

Demonstrates running mythos_1b with INT4 quantization and expert offloading
on consumer hardware (RTX 3060 12GB).

Usage:
python examples/quantized_inference.py
"""

import torch
import time
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from open_mythos import OpenMythos, mythos_1b
from open_mythos.quantization import quantize_model, print_quantization_summary
from open_mythos.expert_offloader import ExpertOffloader


def main():
    """Build, quantize, and benchmark ``mythos_1b`` text generation.

    Steps, each announced on stdout:
      1. Create the model and print its parameter count.
      2. Quantize it with ``quantize_model(bits=4, group_size=128)``.
      3. When CUDA is available, wrap it in an ``ExpertOffloader``.
      4. Generate from random token ids (a short warm-up, then a timed run).
      5. Report CUDA memory counters and offloader statistics.
    """
    print("=" * 60)
    print("OpenMythos Quantized Inference Demo")
    print("=" * 60)

    use_cuda = torch.cuda.is_available()

    # 1. Create model
    print("\n[1/5] Creating mythos_1b model...")
    cfg = mythos_1b()
    model = OpenMythos(cfg)
    print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")

    # 2. Quantize to INT4
    print("\n[2/5] Quantizing to INT4 (expert FFN layers only)...")
    model = quantize_model(model, bits=4, group_size=128)
    print_quantization_summary(model)

    # 3. Setup expert offloading
    print("\n[3/5] Setting up expert offloading...")
    print(f" GPU: {torch.cuda.get_device_name(0) if use_cuda else 'CPU'}")

    # Bound on every path so later code never hits an unbound name.
    offloader = None
    if use_cuda:
        offloader = ExpertOffloader(
            model,
            gpu_experts=4,  # Keep 4 experts on GPU
            cache_experts=16,  # Keep 16 in CPU RAM
        )
        offloader.prepare()
        print(" GPU experts: 4 | CPU cache: 16 | Disk: rest")
    else:
        print(" Running on CPU (no offloading needed)")

    # 4. Generate text
    print("\n[4/5] Generating text...")
    input_ids = torch.randint(0, cfg.vocab_size, (1, 32))
    if use_cuda:
        # NOTE(review): the model itself is never moved with .cuda()/.to();
        # this assumes offloader.prepare() handles device placement — confirm.
        input_ids = input_ids.cuda()

    with torch.no_grad():
        # Warmup (inside no_grad so no autograd graph is kept alive).
        _ = model.generate(input_ids, max_new_tokens=4, n_loops=2)

        # CUDA kernels launch asynchronously: drain pending work before
        # starting the clock and again before stopping it, otherwise the
        # measurement can exclude part of the generation.
        if use_cuda:
            torch.cuda.synchronize()

        # Benchmark
        start = time.perf_counter()
        output = model.generate(input_ids, max_new_tokens=64, n_loops=4)
        if use_cuda:
            torch.cuda.synchronize()
        elapsed = time.perf_counter() - start

    tokens_generated = output.shape[1] - input_ids.shape[1]
    tokens_per_sec = tokens_generated / elapsed

    print(f" Generated {tokens_generated} tokens in {elapsed:.2f}s")
    print(f" Speed: {tokens_per_sec:.1f} tokens/sec")

    # 5. Memory usage
    print("\n[5/5] Memory usage:")
    if use_cuda:
        allocated = torch.cuda.memory_allocated() / 1024 / 1024
        reserved = torch.cuda.memory_reserved() / 1024 / 1024
        print(f" GPU allocated: {allocated:.1f} MB")
        print(f" GPU reserved: {reserved:.1f} MB")

    if offloader is not None:
        print("\nOffloader stats:")
        offloader.print_stats()

    print("\n" + "=" * 60)
    print("Done! Model runs successfully with INT4 quantization.")
    print("=" * 60)


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Loading