33 commits
c0fc9e8
Update API_BASE to use environment variable
isam1978mm Mar 21, 2026
358775c
Update council and chairman models in config
isam1978mm Mar 21, 2026
18b9f72
Fix indentation in config.py for COUNCIL_MODELS
isam1978mm Mar 21, 2026
7333286
Update CORS settings to allow all origins
isam1978mm Mar 21, 2026
c89e31c
Refactor config management and add JSON support
isam1978mm Mar 21, 2026
c912593
Update main.py
isam1978mm Mar 21, 2026
c0e9b6d
Add API endpoints for getting and updating config
isam1978mm Mar 21, 2026
9998626
Refactor API methods and add config endpoints
isam1978mm Mar 21, 2026
dfebb53
Add Settings component for council management
isam1978mm Mar 21, 2026
fdcb305
Implement settings button and modal in App
isam1978mm Mar 21, 2026
8302c6f
Refactor council orchestration to use load_config
isam1978mm Mar 21, 2026
24ae97e
Add supabase dependency to pyproject.toml
isam1978mm Mar 21, 2026
8b9f6cb
Refactor storage to use Supabase for conversations
isam1978mm Mar 21, 2026
807fe26
Fix indentation in update_conversation_title function
isam1978mm Mar 21, 2026
ad9d6a1
Add Nixpacks configuration for Python and Uvicorn
isam1978mm Mar 21, 2026
9865ab4
Rename uv.lockxxxx to uv.lock
isam1978mm Mar 21, 2026
223958f
Add initial requirements for FastAPI project
isam1978mm Mar 21, 2026
53ec55b
Add model appearance recording and statistics retrieval
isam1978mm Mar 22, 2026
d5b3bda
Record model stats in Supabase after aggregation
isam1978mm Mar 22, 2026
5f70152
Implement API endpoint for model stats
isam1978mm Mar 22, 2026
97ffe34
Add getConfig and getStats API methods
isam1978mm Mar 22, 2026
6959e4e
Add Leaderboard component to display model stats
isam1978mm Mar 22, 2026
74f709a
Add leaderboard button to App component
isam1978mm Mar 22, 2026
9a232a3
Implement error handling for model stats recording
isam1978mm Mar 23, 2026
42b438d
Implement '/api/test-stats' endpoint
isam1978mm Mar 23, 2026
93cc216
Update storage.py
isam1978mm Mar 23, 2026
3904e47
Remove test_stats API endpoint
isam1978mm Mar 23, 2026
d22ce71
Update council.py
isam1978mm Mar 23, 2026
9c07bf1
Flush print statements to stderr for debugging
isam1978mm Mar 23, 2026
c852493
Update council.py
isam1978mm Mar 23, 2026
63d4f87
Update main.py
isam1978mm Mar 23, 2026
52ea04c
Fix header typo in README.md
isam1978mm Mar 23, 2026
c175d15
Make frontend layout responsive
isam1978mm Mar 23, 2026
2 changes: 1 addition & 1 deletion README.md
@@ -14,7 +14,7 @@ In a bit more detail, here is what happens when you submit a query:

This project was 99% vibe coded as a fun Saturday hack because I wanted to explore and evaluate a number of LLMs side by side in the process of [reading books together with LLMs](https://x.com/karpathy/status/1990577951671509438). It's nice and useful to see multiple responses side by side, and also the cross-opinions of all LLMs on each other's outputs. I'm not going to support it in any way, it's provided here as is for other people's inspiration and I don't intend to improve it. Code is ephemeral now and libraries are over, ask your LLM to change it in whatever way you like.

## Setup
## Setup d

### 1. Install Dependencies

50 changes: 39 additions & 11 deletions backend/config.py
@@ -1,26 +1,54 @@
"""Configuration for the LLM Council."""

import os
import json
from dotenv import load_dotenv

load_dotenv()

# OpenRouter API key
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")

# Council members - list of OpenRouter model identifiers
COUNCIL_MODELS = [
"openai/gpt-5.1",
"google/gemini-3-pro-preview",
"anthropic/claude-sonnet-4.5",
"x-ai/grok-4",
]

# Chairman model - synthesizes final response
CHAIRMAN_MODEL = "google/gemini-3-pro-preview"

# OpenRouter API endpoint
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"

# Data directory for conversation storage
DATA_DIR = "data/conversations"

# Config file for dynamic model settings
CONFIG_FILE = "data/council_config.json"

# Default models
DEFAULT_COUNCIL_MODELS = [
"google/gemini-2.5-flash-lite",
"deepseek/deepseek-chat-v3.1",
"openai/gpt-5-mini",
]
DEFAULT_CHAIRMAN_MODEL = "openai/gpt-5-mini"


def load_config():
"""Load config from file, fallback to defaults."""
if os.path.exists(CONFIG_FILE):
with open(CONFIG_FILE, "r") as f:
return json.load(f)
return {
"council_models": DEFAULT_COUNCIL_MODELS,
"chairman_model": DEFAULT_CHAIRMAN_MODEL,
}


def save_config(council_models, chairman_model):
"""Save config to file."""
os.makedirs(os.path.dirname(CONFIG_FILE), exist_ok=True)
with open(CONFIG_FILE, "w") as f:
json.dump({
"council_models": council_models,
"chairman_model": chairman_model,
}, f)


# Load on startup
_config = load_config()
COUNCIL_MODELS = _config["council_models"]
CHAIRMAN_MODEL = _config["chairman_model"]
113 changes: 25 additions & 88 deletions backend/council.py
@@ -1,29 +1,26 @@
"""3-stage LLM Council orchestration."""

import logging
from typing import List, Dict, Any, Tuple
from .openrouter import query_models_parallel, query_model
from .config import COUNCIL_MODELS, CHAIRMAN_MODEL
from .config import load_config
from . import storage

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def stage1_collect_responses(user_query: str) -> List[Dict[str, Any]]:
"""
Stage 1: Collect individual responses from all council models.

Args:
user_query: The user's question

Returns:
List of dicts with 'model' and 'response' keys
"""
messages = [{"role": "user", "content": user_query}]

# Query all models in parallel
responses = await query_models_parallel(COUNCIL_MODELS, messages)
responses = await query_models_parallel(load_config()["council_models"], messages)

# Format results
stage1_results = []
for model, response in responses.items():
if response is not None: # Only include successful responses
if response is not None:
stage1_results.append({
"model": model,
"response": response.get('content', '')
@@ -38,24 +35,14 @@ async def stage2_collect_rankings(
) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
    """
    Stage 2: Each model ranks the anonymized responses.

    Args:
        user_query: The original user query
        stage1_results: Results from Stage 1

    Returns:
        Tuple of (rankings list, label_to_model mapping)
    """
    # Create anonymized labels for responses (Response A, Response B, etc.)
    labels = [chr(65 + i) for i in range(len(stage1_results))]  # A, B, C, ...
    labels = [chr(65 + i) for i in range(len(stage1_results))]

    # Create mapping from label to model name
    label_to_model = {
        f"Response {label}": result['model']
        for label, result in zip(labels, stage1_results)
    }

    # Build the ranking prompt
    responses_text = "\n\n".join([
        f"Response {label}:\n{result['response']}"
        for label, result in zip(labels, stage1_results)
@@ -94,10 +81,8 @@ async def stage2_collect_rankings(

    messages = [{"role": "user", "content": ranking_prompt}]

    # Get rankings from all council models in parallel
    responses = await query_models_parallel(COUNCIL_MODELS, messages)
    responses = await query_models_parallel(load_config()["council_models"], messages)

    # Format results
    stage2_results = []
    for model, response in responses.items():
        if response is not None:
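Stage 2 hides model identities behind neutral labels so no ranker can recognize (and favor) a particular provider's output. A worked illustration of the mapping built above, with an arbitrary two-model council:

```python
stage1_results = [
    {"model": "openai/gpt-5-mini", "response": "..."},
    {"model": "deepseek/deepseek-chat-v3.1", "response": "..."},
]
labels = [chr(65 + i) for i in range(len(stage1_results))]  # ['A', 'B']
label_to_model = {
    f"Response {label}": result["model"]
    for label, result in zip(labels, stage1_results)
}
# {'Response A': 'openai/gpt-5-mini', 'Response B': 'deepseek/deepseek-chat-v3.1'}
```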
@@ -119,16 +104,9 @@ async def stage3_synthesize_final(
) -> Dict[str, Any]:
    """
    Stage 3: Chairman synthesizes final response.

    Args:
        user_query: The original user query
        stage1_results: Individual model responses from Stage 1
        stage2_results: Rankings from Stage 2

    Returns:
        Dict with 'model' and 'response' keys
    """
    # Build comprehensive context for chairman
    chairman_model = load_config()["chairman_model"]

    stage1_text = "\n\n".join([
        f"Model: {result['model']}\nResponse: {result['response']}"
        for result in stage1_results
@@ -158,52 +136,36 @@ async def stage3_synthesize_final(

    messages = [{"role": "user", "content": chairman_prompt}]

    # Query the chairman model
    response = await query_model(CHAIRMAN_MODEL, messages)
    response = await query_model(chairman_model, messages)

    if response is None:
        # Fallback if chairman fails
        return {
            "model": CHAIRMAN_MODEL,
            "model": chairman_model,
            "response": "Error: Unable to generate final synthesis."
        }

    return {
        "model": CHAIRMAN_MODEL,
        "model": chairman_model,
        "response": response.get('content', '')
    }


def parse_ranking_from_text(ranking_text: str) -> List[str]:
    """
    Parse the FINAL RANKING section from the model's response.

    Args:
        ranking_text: The full text response from the model

    Returns:
        List of response labels in ranked order
    """
    import re

    # Look for "FINAL RANKING:" section
    if "FINAL RANKING:" in ranking_text:
        # Extract everything after "FINAL RANKING:"
        parts = ranking_text.split("FINAL RANKING:")
        if len(parts) >= 2:
            ranking_section = parts[1]
            # Try to extract numbered list format (e.g., "1. Response A")
            # This pattern looks for: number, period, optional space, "Response X"
            numbered_matches = re.findall(r'\d+\.\s*Response [A-Z]', ranking_section)
            if numbered_matches:
                # Extract just the "Response X" part
                return [re.search(r'Response [A-Z]', m).group() for m in numbered_matches]

            # Fallback: Extract all "Response X" patterns in order
            matches = re.findall(r'Response [A-Z]', ranking_section)
            return matches

    # Fallback: try to find any "Response X" patterns in order
    matches = re.findall(r'Response [A-Z]', ranking_text)
    return matches

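Both branches of the parser in action on typical Stage-2 output (inputs are illustrative):

```python
text = """Some analysis of each response...

FINAL RANKING:
1. Response B
2. Response A"""
parse_ranking_from_text(text)
# -> ['Response B', 'Response A']  (numbered-list branch)

parse_ranking_from_text("I slightly prefer Response A to Response B")
# -> ['Response A', 'Response B']  (regex fallback, order of appearance)
```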
@@ -214,31 +176,20 @@ def calculate_aggregate_rankings(
) -> List[Dict[str, Any]]:
    """
    Calculate aggregate rankings across all models.

    Args:
        stage2_results: Rankings from each model
        label_to_model: Mapping from anonymous labels to model names

    Returns:
        List of dicts with model name and average rank, sorted best to worst
    """
    from collections import defaultdict

    # Track positions for each model
    model_positions = defaultdict(list)

    for ranking in stage2_results:
        ranking_text = ranking['ranking']

        # Parse the ranking from the structured format
        parsed_ranking = parse_ranking_from_text(ranking_text)

        for position, label in enumerate(parsed_ranking, start=1):
            if label in label_to_model:
                model_name = label_to_model[label]
                model_positions[model_name].append(position)

    # Calculate average position for each model
    aggregate = []
    for model, positions in model_positions.items():
        if positions:
@@ -249,7 +200,6 @@ def calculate_aggregate_rankings(
                "rankings_count": len(positions)
            })

    # Sort by average rank (lower is better)
    aggregate.sort(key=lambda x: x['average_rank'])

    return aggregate
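A worked example of the aggregation, assuming the elided append computes average_rank as the mean of each model's positions (that line is cut off in this view): two rankers that both put Response B first give B's model an average_rank of 1.0 and A's model 2.0:

```python
stage2_results = [
    {"model": "ranker-1", "ranking": "FINAL RANKING:\n1. Response B\n2. Response A"},
    {"model": "ranker-2", "ranking": "FINAL RANKING:\n1. Response B\n2. Response A"},
]
label_to_model = {
    "Response A": "openai/gpt-5-mini",
    "Response B": "deepseek/deepseek-chat-v3.1",
}
calculate_aggregate_rankings(stage2_results, label_to_model)
# -> [{'model': 'deepseek/deepseek-chat-v3.1', 'average_rank': 1.0, 'rankings_count': 2},
#     {'model': 'openai/gpt-5-mini', 'average_rank': 2.0, 'rankings_count': 2}]
```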
@@ -258,12 +208,6 @@ def calculate_aggregate_rankings(
async def generate_conversation_title(user_query: str) -> str:
    """
    Generate a short title for a conversation based on the first user message.

    Args:
        user_query: The first user message

    Returns:
        A short title (3-5 words)
    """
    title_prompt = f"""Generate a very short title (3-5 words maximum) that summarizes the following question.
The title should be concise and descriptive. Do not use quotes or punctuation in the title.
@@ -274,19 +218,14 @@ async def generate_conversation_title(user_query: str) -> str:

    messages = [{"role": "user", "content": title_prompt}]

    # Use gemini-2.5-flash for title generation (fast and cheap)
    response = await query_model("google/gemini-2.5-flash", messages, timeout=30.0)

    if response is None:
        # Fallback to a generic title
        return "New Conversation"

    title = response.get('content', 'New Conversation').strip()

    # Clean up the title - remove quotes, limit length
    title = title.strip('"\'')

    # Truncate if too long
    if len(title) > 50:
        title = title[:47] + "..."

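The cleanup steps bound the title at 50 characters: wrapping quotes are stripped first, then anything longer is cut to 47 characters plus an ellipsis. For instance:

```python
title = '"How do transformers handle very long context windows?"'.strip('"\'')
if len(title) > 50:
    title = title[:47] + "..."
# -> first 47 characters + '...', exactly 50 characters total
```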
@@ -296,37 +235,35 @@ async def generate_conversation_title(user_query: str) -> str:
async def run_full_council(user_query: str) -> Tuple[List, List, Dict, Dict]:
    """
    Run the complete 3-stage council process.

    Args:
        user_query: The user's question

    Returns:
        Tuple of (stage1_results, stage2_results, stage3_result, metadata)
    """
    # Stage 1: Collect individual responses
    stage1_results = await stage1_collect_responses(user_query)

    # If no models responded successfully, return error
    if not stage1_results:
        return [], [], {
            "model": "error",
            "response": "All models failed to respond. Please try again."
        }, {}

    # Stage 2: Collect rankings
    stage2_results, label_to_model = await stage2_collect_rankings(user_query, stage1_results)

    # Calculate aggregate rankings
    aggregate_rankings = calculate_aggregate_rankings(stage2_results, label_to_model)

    # Stage 3: Synthesize final answer
    # Record model stats in Supabase
    logger.info(f"DEBUG: Recording stats for models: {[r['model'] for r in stage1_results]}")
    logger.info(f"DEBUG: aggregate_rankings: {aggregate_rankings}")
    try:
        all_models = [r["model"] for r in stage1_results]
        storage.record_model_appearances(all_models, aggregate_rankings)
        logger.info("DEBUG: Stats recorded successfully")
    except Exception as e:
        logger.exception(f"STATS ERROR: {e}")

    stage3_result = await stage3_synthesize_final(
        user_query,
        stage1_results,
        stage2_results
    )

    # Prepare metadata
    metadata = {
        "label_to_model": label_to_model,
        "aggregate_rankings": aggregate_rankings