diff --git a/src/training/model_eval/config_template.yaml b/src/training/model_eval/config_template.yaml new file mode 100644 index 00000000..8f80a02e --- /dev/null +++ b/src/training/model_eval/config_template.yaml @@ -0,0 +1,91 @@ +# Router Configuration Template +# This template is used by the reasoning evaluation script to generate optimized configs + +# BERT model configuration for semantic similarity +bert_model: + model_id: "sentence-transformers/all-MiniLM-L12-v2" + threshold: 0.6 + use_cpu: true + +# Semantic caching configuration +semantic_cache: + enabled: false # Disabled by default for development + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 + +# Tool selection configuration +tools: + enabled: true + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + +# Prompt guard (jailbreak detection) configuration +prompt_guard: + enabled: false # Disabled by default for development + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Classification models configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + +# vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models +vllm_endpoints: + - name: "endpoint1" + address: "127.0.0.1" + port: 8000 + models: + - "" # Will be populated by evaluation script + weight: 1 # Load balancing weight + health_check_path: "/health" # Optional health check endpoint + +# Model-specific configuration +model_config: + # Will be populated by evaluation script with model-specific settings + # Example structure: + # "model-name": + # reasoning_family: "qwen3" # or "deepseek", "gpt-oss", etc. + # preferred_endpoints: ["endpoint1"] + # pii_policy: + # allow_by_default: true + +# These will be populated by the evaluation script +default_model: "" +default_reasoning_effort: "high" +categories: [] + +# Reasoning family configurations - define how different model families handle reasoning syntax +reasoning_families: + deepseek: + type: "chat_template_kwargs" + parameter: "thinking" + + qwen3: + type: "chat_template_kwargs" + parameter: "enable_thinking" + + gpt-oss: + type: "reasoning_effort" + parameter: "reasoning_effort" + + gpt: + type: "reasoning_effort" + parameter: "reasoning_effort" diff --git a/src/training/model_eval/reasoning_eval_consolidated.py b/src/training/model_eval/reasoning_eval_consolidated.py new file mode 100755 index 00000000..2daa95ab --- /dev/null +++ b/src/training/model_eval/reasoning_eval_consolidated.py @@ -0,0 +1,921 @@ +#!/usr/bin/env python3 +""" +Statistical Reasoning Mode Evaluation for LLM Router Configuration + +A model evaluation framework that determines optimal reasoning mode +configurations for large language model routers using statistical +analysis and evidence-based decision making. 
+ +This module implements a multi-criteria statistical framework that evaluates +the effectiveness of reasoning modes across different task categories, generating +optimized router configurations based on empirical evidence rather than +arbitrary thresholds. + +Statistical Methodology: + - McNemar's test for paired binary outcomes + - Fisher's exact test for unpaired comparisons + - Cohen's h effect size analysis for practical significance + - Bayesian evidence estimation using Beta-Binomial conjugate priors + - Multi-pathway significance testing framework + +Key Features: + - Robust statistical inference for small sample sizes + - Evidence-based decision making with transparent reasoning + - Comprehensive effect size and practical significance analysis + - Bayesian probability estimation for intuitive interpretation + - Production-ready configuration generation + +Usage: + python reasoning_eval_consolidated.py \\ + --endpoint http://localhost:8000/v1 \\ + --samples-per-category 25 \\ + --output-config optimized_config.yaml + +Outputs: + - YAML configuration file with optimized reasoning decisions + - CSV file containing detailed evaluation results + - JSON file with comprehensive statistical analysis +""" + +import argparse +import json +import os +import re +import time +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Dict, List, Optional + +import numpy as np +import pandas as pd +import yaml +from datasets import load_dataset +from openai import OpenAI +from scipy import stats +from tqdm import tqdm + +# Regular expression pattern for extracting multiple choice answers from model responses +ANSWER_PATTERN = re.compile(r"(?:answer(?:\sis)?:?\s*)([A-J])", re.IGNORECASE) + +# Statistical significance thresholds +ALPHA_STRICT = 0.05 # Traditional significance level +ALPHA_RELAXED = 0.10 # Relaxed significance level for borderline cases +EFFECT_SIZE_MEDIUM = 0.2 # Cohen's h threshold for medium effect size +BAYESIAN_THRESHOLD = 0.8 # Bayesian probability threshold for strong evidence +MIN_IMPROVEMENT = 0.10 # Minimum improvement threshold for Bayesian pathway + +# Model API configuration +MAX_TOKENS_REASONING = ( + 6144 # Maximum tokens for reasoning mode (allows full reasoning chains) +) + + +def parse_args() -> argparse.Namespace: + """ + Parse command line arguments for the reasoning evaluation framework. 
+ + Returns: + argparse.Namespace: Parsed command line arguments + """ + parser = argparse.ArgumentParser( + description="Statistical reasoning mode evaluation for LLM router configuration", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # API Configuration + parser.add_argument( + "--endpoint", + type=str, + default="http://localhost:8000/v1", + help="API endpoint URL (default: %(default)s)", + ) + parser.add_argument( + "--api-key", + type=str, + default="", + help="API key (auto-detects OPENAI_API_KEY environment variable)", + ) + parser.add_argument( + "--use-openai", + action="store_true", + help="Use OpenAI API (sets endpoint to https://api.openai.com/v1)", + ) + + # Model Configuration + parser.add_argument( + "--models", + type=str, + nargs="*", + help="Models to evaluate (auto-discover if not specified)", + ) + + # Evaluation Parameters + parser.add_argument( + "--samples-per-category", + type=int, + default=5, + help="Number of questions per category (default: %(default)s)", + ) + parser.add_argument( + "--concurrent-requests", + type=int, + default=4, + help="Number of concurrent API requests (default: %(default)s)", + ) + + # Output Configuration + parser.add_argument( + "--output-config", + type=str, + default="config.yaml", + help="Output configuration file path (default: %(default)s)", + ) + parser.add_argument( + "--config-template", + type=str, + default="", + help="Path to configuration template YAML file", + ) + + # Statistical Parameters + parser.add_argument( + "--significance-level", + type=float, + default=ALPHA_STRICT, + help=f"Statistical significance level (default: {ALPHA_STRICT})", + ) + + return parser.parse_args() + + +def get_models(endpoint: str, api_key: str) -> List[str]: + """ + Discover available models from the specified API endpoint. + + Args: + endpoint: API endpoint URL + api_key: API authentication key + + Returns: + List of available model identifiers + + Raises: + None: Errors are logged and empty list is returned + """ + try: + # Validate OpenAI API requirements + if "api.openai.com" in endpoint and not api_key: + print( + "ERROR: OpenAI API requires authentication. Please set OPENAI_API_KEY environment variable or use --api-key" + ) + return [] + + # Initialize API client + client = OpenAI(base_url=endpoint, api_key=api_key or "dummy") + models_response = client.models.list() + model_list = [model.id for model in models_response.data] + + # Apply OpenAI-specific filtering for reasoning-capable models + if "api.openai.com" in endpoint: + reasoning_capable = ["gpt-5"] + filtered_models = [ + model + for model in model_list + if any(capability in model.lower() for capability in reasoning_capable) + ] + + if filtered_models: + print( + f"Discovered {len(filtered_models)} reasoning-capable models: {filtered_models}" + ) + return filtered_models + else: + print( + f"WARNING: No reasoning-capable models found. Available models: {model_list}" + ) + return model_list + + print(f"Discovered {len(model_list)} models from endpoint") + return model_list + + except Exception as e: + print(f"ERROR: Failed to discover models from {endpoint}: {e}") + return [] + + +def build_reasoning_params(model: str, reasoning: bool) -> Optional[Dict[str, Any]]: + """ + Construct model-specific reasoning parameters for API requests. + + Different model families use different parameter structures to enable + reasoning mode. This function maps model identifiers to their appropriate + reasoning parameter format. 
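+    For example, calling this function with reasoning=True returns
+    {"chat_template_kwargs": {"enable_thinking": True}} for a Qwen3 model and
+    {"reasoning_effort": "high"} for a GPT-OSS model.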
+ + Args: + model: Model identifier string + reasoning: Whether to enable reasoning mode + + Returns: + Dictionary of reasoning parameters, or None if no parameters needed + + Note: + Parameter structures are based on model family conventions: + - DeepSeek models: chat_template_kwargs.thinking + - Qwen3 models: chat_template_kwargs.enable_thinking + - GPT-OSS models: reasoning_effort parameter + """ + model_lower = model.lower() + + # vLLM-hosted model families with specific reasoning implementations + if "deepseek" in model_lower and re.search(r"v3(?:[._]?1(?:\.\d+)?)?", model_lower): + # DeepSeek v3.x reasoning via chat template kwargs + return {"chat_template_kwargs": {"thinking": reasoning}} + elif "qwen3" in model_lower: + # Qwen3 reasoning via chat template kwargs + return {"chat_template_kwargs": {"enable_thinking": reasoning}} + elif "gpt-oss" in model_lower: + # GPT-OSS reasoning via effort parameter + return {"reasoning_effort": "high" if reasoning else "low"} + + # Model does not support reasoning parameters + return None + + +def format_prompt(question: str, options: List[str]) -> str: + """Format MMLU-Pro prompt.""" + letters = "ABCDEFGHIJ" + formatted = "\n".join( + f"{letters[i]}) {opt}" for i, opt in enumerate(options) if opt.lower() != "n/a" + ) + return f"Question: {question}\n\nOptions:\n{formatted}\n\nProvide your answer in the format 'Answer: [letter]'." + + +def extract_answer(response: str) -> Optional[str]: + """Extract answer from model response.""" + if not response: + return None + match = ANSWER_PATTERN.search(response) + if match: + return match.group(1).upper() + for char in reversed(response): + if char.upper() in "ABCDEFGHIJ": + return char.upper() + return None + + +def call_model( + client: OpenAI, + model: str, + prompt: str, + extra_body: Optional[Dict] = None, + debug_print: bool = False, +) -> Dict[str, Any]: + """Call model and return result.""" + try: + start = time.time() + + # Build request parameters + params = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": MAX_TOKENS_REASONING, + "temperature": 0.0, + } + + # Add reasoning parameters based on model type + if extra_body: + if "reasoning" in extra_body: + # OpenAI reasoning parameter goes at top level + params["reasoning"] = extra_body["reasoning"] + if debug_print: + print( + f"šŸ”§ OpenAI reasoning param: reasoning={extra_body['reasoning']}" + ) + else: + # All vLLM parameters (including chat_template_kwargs) go in extra_body + # The Python OpenAI client doesn't support chat_template_kwargs at top level + params["extra_body"] = extra_body + if debug_print: + print(f"šŸ”§ vLLM extra_body: {extra_body}") + elif debug_print: + print(f"šŸ”§ No reasoning params for {model}") + + response = client.chat.completions.create(**params) + + text = response.choices[0].message.content + usage = getattr(response, "usage", None) + + return { + "response": text, + "success": True, + "time": time.time() - start, + "prompt_tokens": getattr(usage, "prompt_tokens", None) if usage else None, + "completion_tokens": ( + getattr(usage, "completion_tokens", None) if usage else None + ), + "total_tokens": getattr(usage, "total_tokens", None) if usage else None, + } + except Exception as e: + return { + "response": str(e), + "success": False, + "time": 0, + "prompt_tokens": None, + "completion_tokens": None, + "total_tokens": None, + } + + +def evaluate_model( + model: str, endpoint: str, api_key: str, df: pd.DataFrame, concurrent: int +) -> pd.DataFrame: + """Evaluate model with NR 
and NR_REASONING modes.""" + client = OpenAI(base_url=endpoint, api_key=api_key or "dummy") + print(f"Evaluating {model} with {len(df)} questions...") + + # Special handling for o1 models (always use reasoning, no separate modes) + if "o1" in model.lower(): + print("Note: o1 models always use reasoning. Running single mode evaluation.") + modes = [("NR_REASONING", True)] # Only one mode for o1 + else: + modes = [("NR", False), ("NR_REASONING", True)] + + # Create all tasks (question Ɨ mode combinations) + tasks = [] + for _, row in df.iterrows(): + prompt = format_prompt(row["question"], row["options"]) + for mode, reasoning in modes: + extra_body = build_reasoning_params(model, reasoning) + tasks.append((row, prompt, mode, extra_body)) + + print(f"Total tasks: {len(tasks)} ({len(df)} questions Ɨ {len(modes)} modes)") + + # Track debug printing with a counter + debug_counter = {"count": 0} + + def process_task(task): + row, prompt, mode, extra_body = task + # Enable debug printing for first 2 tasks only + debug_print = debug_counter["count"] < 2 + if debug_print: + debug_counter["count"] += 1 + result = call_model(client, model, prompt, extra_body, debug_print=debug_print) + predicted = extract_answer(result["response"]) if result["success"] else None + + return { + "question_id": row["question_id"], + "category": row["category"], + "correct_answer": row["answer"], + "predicted_answer": predicted, + "is_correct": predicted == row["answer"] if predicted else False, + "mode": mode, + "success": result["success"], + "response_time": result["time"], + "prompt_tokens": result["prompt_tokens"], + "completion_tokens": result["completion_tokens"], + "total_tokens": result["total_tokens"], + } + + # Process tasks concurrently + results = [] + with ThreadPoolExecutor(max_workers=concurrent) as executor: + futures = [executor.submit(process_task, task) for task in tasks] + try: + for future in tqdm(futures, desc=f"Evaluating {model}"): + results.append(future.result()) + except KeyboardInterrupt: + print("\nāš ļø Evaluation interrupted by user. Cancelling remaining tasks...") + # Cancel remaining futures + for future in futures: + future.cancel() + # Collect results from completed futures + for future in futures: + if future.done() and not future.cancelled(): + try: + results.append(future.result()) + except Exception: + pass # Skip failed results + if not results: + print("āŒ No results to save.") + raise + print(f"āœ… Saved {len(results)} partial results.") + + return pd.DataFrame(results) + + +def analyze_reasoning_statistical(results_df: pd.DataFrame) -> Dict[str, Dict]: + """ + Analyze reasoning effectiveness using enhanced statistical methodology. + + This function implements a multi-criteria statistical framework that goes beyond + simple p-value thresholds to make evidence-based decisions about reasoning mode + effectiveness across different categories. + + STATISTICAL METHODOLOGY: + ======================= + + 1. SIGNIFICANCE TESTING: + - McNemar's test for paired data (same questions, different modes) + - Fisher's exact test for unpaired data as fallback + - Appropriate for small sample sizes with exact p-values + + 2. EFFECT SIZE CALCULATION: + - Cohen's h for proportion differences + - Formula: h = 2 * (arcsin(√pā‚‚) - arcsin(√p₁)) + - Provides standardized measure of practical significance + + 3. BAYESIAN ANALYSIS: + - Beta-binomial conjugate prior model + - Uniform priors: Beta(1,1) for both conditions + - Monte Carlo estimation of P(reasoning > baseline) + + 4. 
MULTI-CRITERIA DECISION: + Reasoning is enabled if ANY of these criteria are met: + a) Traditional significance: p < 0.05 + b) Borderline significance: p < 0.10 AND |Cohen's h| > 0.2 + c) Bayesian evidence: P(reasoning > baseline) > 80% AND improvement > 10% + + Args: + results_df (pd.DataFrame): Evaluation results with columns: + - category: Question category + - mode: 'NR' or 'NR_REASONING' + - is_correct: Boolean correctness + - total_tokens: Token usage + - success: Whether evaluation succeeded + + Returns: + Dict[str, Dict]: Statistical analysis results for each category containing: + - use_reasoning: Boolean decision + - reasoning_effort: 'low', 'medium', or 'high' + - reason: Detailed explanation of decision + - nr_accuracy: Baseline accuracy + - nr_reasoning_accuracy: Reasoning mode accuracy + - improvement: Absolute improvement + - token_overhead: Token usage multiplier + - p_value: Statistical significance + - cohens_h: Effect size + - bayesian_prob: Probability reasoning is better + - sample_size: Number of questions evaluated + - traditional_significant: p < 0.05 + - borderline_significant: p < 0.10 AND medium effect + - bayesian_significant: Strong Bayesian evidence + + INTERPRETATION GUIDE: + ==================== + - p_value < 0.05: Strong statistical evidence + - 0.05 ≤ p_value < 0.10 with |h| > 0.2: Moderate evidence with meaningful effect + - bayesian_prob > 80%: Strong probabilistic evidence + - |cohens_h| > 0.2: Practically meaningful difference + - improvement > 5%: Substantial practical benefit + + Example: + >>> df = pd.read_csv('evaluation_results.csv') + >>> decisions = analyze_reasoning_statistical(df) + >>> enabled_categories = [cat for cat, dec in decisions.items() + ... if dec['use_reasoning']] + """ + decisions = {} + + for category in results_df["category"].unique(): + cat_df = results_df[ + (results_df["category"] == category) & (results_df["success"]) + ] + + nr_df = cat_df[cat_df["mode"] == "NR"] + nr_reasoning_df = cat_df[cat_df["mode"] == "NR_REASONING"] + + if nr_df.empty or nr_reasoning_df.empty: + decisions[category] = { + "use_reasoning": False, + "reasoning_effort": "low", + "reason": "No data", + "nr_accuracy": 0.0, + "nr_reasoning_accuracy": 0.0, + "improvement": 0.0, + "token_overhead": 0.0, + "p_value": 1.0, + "sample_size": 0, + "statistically_significant": False, + } + continue + + # Get binary results for statistical testing + nr_results = nr_df["is_correct"].values + nr_reasoning_results = nr_reasoning_df["is_correct"].values + + nr_acc = nr_results.mean() + nr_reasoning_acc = nr_reasoning_results.mean() + nr_tokens = nr_df["total_tokens"].dropna().mean() or 0 + nr_reasoning_tokens = nr_reasoning_df["total_tokens"].dropna().mean() or 0 + + improvement = nr_reasoning_acc - nr_acc + overhead = nr_reasoning_tokens / nr_tokens if nr_tokens > 0 else float("inf") + + # STEP 1: STATISTICAL SIGNIFICANCE TESTING + # ======================================== + # We use different tests depending on whether we have paired data + # (same questions evaluated in both modes) or unpaired data. + + if len(nr_results) == len(nr_reasoning_results): + # PAIRED DATA: McNemar's Test + # --------------------------- + # McNemar's test is appropriate for paired binary data where the same + # subjects (questions) are tested under two different conditions (modes). + # It focuses on the discordant pairs - cases where the two modes disagree. 
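+            #
+            # Worked illustration (hypothetical counts, not from this evaluation):
+            # with b = 2 questions answered correctly only in NR mode and c = 8
+            # answered correctly only in reasoning mode, the continuity-corrected
+            # statistic is (|2 - 8| - 1)**2 / (2 + 8) = 2.5, giving
+            # p = 1 - chi2.cdf(2.5, 1), roughly 0.11.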
+ + # Create 2x2 contingency table for paired responses: + # Reasoning Mode + # Correct Incorrect + # NR Mode Correct a b <- b = nr_only_correct + # Incorrect c d <- c = reasoning_only_correct + + both_correct = np.sum((nr_results == 1) & (nr_reasoning_results == 1)) # a + nr_only_correct = np.sum( + (nr_results == 1) & (nr_reasoning_results == 0) + ) # b + reasoning_only_correct = np.sum( + (nr_results == 0) & (nr_reasoning_results == 1) + ) # c + both_wrong = np.sum((nr_results == 0) & (nr_reasoning_results == 0)) # d + + # McNemar's test statistic focuses on discordant pairs (b + c) + # Under null hypothesis: P(b) = P(c), so (b-c)² / (b+c) ~ χ²(1) + discordant_pairs = nr_only_correct + reasoning_only_correct + + if discordant_pairs > 0: + # Apply continuity correction for small samples: |b-c| - 1 + # This makes the test more conservative for small sample sizes + mcnemar_stat = ( + abs(nr_only_correct - reasoning_only_correct) - 1 + ) ** 2 / discordant_pairs + p_value = 1 - stats.chi2.cdf(mcnemar_stat, 1) + else: + # No discordant pairs means identical performance + p_value = 1.0 + else: + # UNPAIRED DATA: Fisher's Exact Test + # ---------------------------------- + # When sample sizes differ, we can't pair the data, so we use + # Fisher's exact test for 2x2 contingency tables. This test + # provides exact p-values regardless of sample size. + + nr_correct = np.sum(nr_results) + nr_total = len(nr_results) + reasoning_correct = np.sum(nr_reasoning_results) + reasoning_total = len(nr_reasoning_results) + + # Create 2x2 contingency table: + # Correct Incorrect Total + # NR Mode a b n1 + # Reasoning c d n2 + contingency_table = [ + [nr_correct, nr_total - nr_correct], + [reasoning_correct, reasoning_total - reasoning_correct], + ] + + # Fisher's exact test computes exact p-value using hypergeometric distribution + _, p_value = stats.fisher_exact(contingency_table, alternative="two-sided") + + # STEP 2: EFFECT SIZE CALCULATION + # =============================== + # Cohen's h measures the effect size for differences between proportions. + # It's based on the arcsine transformation, which stabilizes variance + # and makes the metric more interpretable across different base rates. + + # Formula: h = 2 * (arcsin(√pā‚‚) - arcsin(√p₁)) + # Interpretation: |h| < 0.2 (small), 0.2-0.5 (medium), >0.5 (large) + if nr_acc > 0 and nr_reasoning_acc > 0: + cohens_h = 2 * ( + np.arcsin(np.sqrt(nr_reasoning_acc)) - np.arcsin(np.sqrt(nr_acc)) + ) + else: + cohens_h = 0 + + # STEP 3: BAYESIAN ANALYSIS + # ========================= + # We use a Beta-Binomial conjugate prior model to estimate the probability + # that reasoning mode is actually better than baseline. This provides an + # intuitive probabilistic interpretation of the evidence. 
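+        #
+        # Worked illustration (hypothetical counts, not from this evaluation):
+        # 15 correct answers out of 25 questions under a uniform Beta(1, 1) prior
+        # yield a Beta(16, 11) posterior with mean 16 / 27, roughly 0.59; sampling
+        # both posteriors and comparing the draws estimates P(reasoning > baseline).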
+ + from scipy.stats import beta + + # Set up Beta posteriors with uniform priors Beta(1,1) + # Posterior: Beta(successes + 1, failures + 1) + nr_successes = np.sum(nr_results) + nr_failures = len(nr_results) - nr_successes + reasoning_successes = np.sum(nr_reasoning_results) + reasoning_failures = len(nr_reasoning_results) - reasoning_successes + + nr_posterior = beta(nr_successes + 1, nr_failures + 1) + reasoning_posterior = beta(reasoning_successes + 1, reasoning_failures + 1) + + # Monte Carlo estimation of P(reasoning_accuracy > baseline_accuracy) + # This gives us the probability that reasoning mode is actually better + np.random.seed(42) # For reproducible results + n_samples = 10000 + nr_samples = nr_posterior.rvs(n_samples) + reasoning_samples = reasoning_posterior.rvs(n_samples) + bayesian_prob = np.mean(reasoning_samples > nr_samples) + + # STEP 4: MULTI-CRITERIA DECISION FRAMEWORK + # ========================================= + # We use three pathways to significance, allowing for different types + # of evidence to support the decision to enable reasoning mode. + + # Apply multi-criteria decision framework + traditional_significant = p_value < ALPHA_STRICT + borderline_significant = (p_value < ALPHA_RELAXED) and ( + abs(cohens_h) > EFFECT_SIZE_MEDIUM + ) + bayesian_significant = (bayesian_prob > BAYESIAN_THRESHOLD) and ( + improvement > MIN_IMPROVEMENT + ) + + # Aggregate significance across all pathways + statistically_significant = ( + traditional_significant or borderline_significant or bayesian_significant + ) + + # Final decision requires both significance and positive improvement + use_reasoning = statistically_significant and improvement > 0 + + # Determine reasoning effort level based on improvement magnitude + if improvement >= 0.15: + effort = "high" + elif improvement >= 0.10: + effort = "medium" + else: + effort = "low" + + # Generate detailed reasoning explanation + significance_criteria = [] + if traditional_significant: + significance_criteria.append(f"Traditional significance (p={p_value:.3f})") + if borderline_significant: + significance_criteria.append( + f"Borderline significance with medium effect (p={p_value:.3f}, h={cohens_h:.2f})" + ) + if bayesian_significant: + significance_criteria.append( + f"Strong Bayesian evidence (P={bayesian_prob:.1%}, Ī”={improvement:.1%})" + ) + + if not statistically_significant: + reason = f"Insufficient evidence: p={p_value:.3f}, h={cohens_h:.2f}, P(better)={bayesian_prob:.1%}" + elif improvement <= 0: + reason = f"Significant but detrimental effect: {improvement:.1%} ({'; '.join(significance_criteria)})" + else: + reason = f"Evidence-based improvement: {improvement:.1%} ({'; '.join(significance_criteria)})" + + decisions[category] = { + "use_reasoning": bool(use_reasoning), + "reasoning_effort": effort, + "reason": reason, + "nr_accuracy": float(nr_acc), + "nr_reasoning_accuracy": float(nr_reasoning_acc), + "improvement": float(improvement), + "token_overhead": float(overhead), + "p_value": float(p_value), + "sample_size": len(nr_results), + "statistically_significant": bool(statistically_significant), + "cohens_h": float(cohens_h), + "bayesian_prob": float(bayesian_prob), + "traditional_significant": bool(traditional_significant), + "borderline_significant": bool(borderline_significant), + "bayesian_significant": bool(bayesian_significant), + } + + return decisions + + +def load_config_template(template_path: str = "") -> Dict: + """Load configuration template from YAML file.""" + if not template_path: + template_path = 
os.path.join(os.path.dirname(__file__), "config_template.yaml") + + try: + with open(template_path, "r") as f: + config = yaml.safe_load(f) + print(f"āœ… Loaded config template from: {template_path}") + return config + except FileNotFoundError: + print(f"āš ļø Warning: Template file not found at {template_path}") + return None + + +def generate_config( + results_df: pd.DataFrame, + reasoning_decisions: Dict, + model: str, + template_path: str = "", +) -> Dict: + """ + Generate router configuration from template with complete model and endpoint setup. + + Args: + results_df: Evaluation results DataFrame + reasoning_decisions: Statistical analysis decisions for each category + model: Primary model identifier + template_path: Path to configuration template file + + Returns: + Complete router configuration dictionary + """ + # Load template + config = load_config_template(template_path) + if config is None: + print("āŒ Cannot generate config without template. Exiting.") + return None + + # Set model-specific values + config["default_model"] = model + + # Configure vLLM endpoints + if "vllm_endpoints" in config and config["vllm_endpoints"]: + # Update the first endpoint with the actual model + config["vllm_endpoints"][0]["models"] = [model] + + # Configure model-specific settings + if "model_config" not in config: + config["model_config"] = {} + + # Determine reasoning family based on model name + model_lower = model.lower() + if "qwen3" in model_lower: + reasoning_family = "qwen3" + elif "deepseek" in model_lower and re.search( + r"v3(?:[._]?1(?:\.\d+)?)?", model_lower + ): + reasoning_family = "deepseek" + elif "gpt-oss" in model_lower: + reasoning_family = "gpt-oss" + elif "gpt" in model_lower: + reasoning_family = "gpt" + else: + reasoning_family = "gpt" # Default fallback + + # Add model configuration + config["model_config"][model] = { + "reasoning_family": reasoning_family, + "preferred_endpoints": ["endpoint1"], + "pii_policy": {"allow_by_default": True}, + } + + # Add categories with reasoning decisions + config["categories"] = [] + for category, decision in reasoning_decisions.items(): + # Get best accuracy for model scoring (use reasoning accuracy if enabled, otherwise baseline) + cat_df = results_df[ + (results_df["category"] == category) & (results_df["success"]) + ] + if decision["use_reasoning"]: + best_acc = decision["nr_reasoning_accuracy"] + else: + best_acc = decision["nr_accuracy"] + + config["categories"].append( + { + "name": category, + "use_reasoning": decision["use_reasoning"], + "reasoning_description": f"Data-driven decision: {decision['reason']}", + "reasoning_effort": decision["reasoning_effort"], + "model_scores": [{"model": model, "score": float(best_acc)}], + } + ) + + return config + + +def main(): + args = parse_args() + + # Handle OpenAI API setup + if args.use_openai: + args.endpoint = "https://api.openai.com/v1" + print("Using OpenAI API endpoint") + + # Auto-detect API key from environment if not provided + if not args.api_key: + args.api_key = os.environ.get("OPENAI_API_KEY", "") + if args.api_key: + print("Using API key from OPENAI_API_KEY environment variable") + + # Validate API key for OpenAI + if "api.openai.com" in args.endpoint and not args.api_key: + print("āŒ OpenAI API requires an API key. Please:") + print(" 1. Set OPENAI_API_KEY environment variable, or") + print(" 2. 
Use --api-key parameter") + return + + print(f"Endpoint: {args.endpoint}") + + # Get models + if not args.models: + print("Auto-discovering models...") + args.models = get_models(args.endpoint, args.api_key) + if not args.models: + print("No models found!") + return + print(f"Found models: {args.models}") + else: + print(f"Using specified models: {args.models}") + + # Load dataset + print("Loading MMLU-Pro dataset...") + dataset = load_dataset("TIGER-Lab/MMLU-Pro", split="test") + df = pd.DataFrame(dataset) + + # Sample questions per category + if args.samples_per_category: + sampled = [] + for category in df["category"].unique(): + cat_df = df[df["category"] == category] + sample_size = min(args.samples_per_category, len(cat_df)) + sampled.append(cat_df.sample(sample_size, random_state=42)) + df = pd.concat(sampled) + + print( + f"Evaluating {len(df)} questions across {df['category'].nunique()} categories" + ) + + # Evaluate each model + all_results = [] + try: + for model in args.models: + results_df = evaluate_model( + model, args.endpoint, args.api_key, df, args.concurrent_requests + ) + results_df["model"] = model + all_results.append(results_df) + except KeyboardInterrupt: + print("\nāš ļø Evaluation interrupted by user.") + if not all_results: + print("āŒ No complete model evaluations to save.") + return + print(f"āœ… Proceeding with {len(all_results)} completed model evaluations.") + + # Combine results + combined_df = pd.concat(all_results, ignore_index=True) + + # Analyze reasoning effectiveness using statistical significance + print("\nAnalyzing reasoning effectiveness using statistical significance...") + reasoning_decisions = analyze_reasoning_statistical(combined_df) + + # Print analysis + reasoning_enabled = sum( + 1 for d in reasoning_decisions.values() if d["use_reasoning"] + ) + print( + f"\nReasoning Analysis ({reasoning_enabled}/{len(reasoning_decisions)} categories enabled):" + ) + for category, decision in reasoning_decisions.items(): + status = "āœ“ ENABLED" if decision["use_reasoning"] else "āœ— DISABLED" + acc_change = ( + f"{decision['nr_accuracy']:.1%} → {decision['nr_reasoning_accuracy']:.1%}" + ) + tokens = f"{decision['token_overhead']:.1f}x tokens" + p_val = f"p={decision['p_value']:.3f}" + print(f" {category}: {status}") + print(f" Accuracy: {acc_change} ({decision['improvement']:+.1%})") + print( + f" Statistical: {p_val} ({'significant' if decision['statistically_significant'] else 'not significant'})" + ) + print( + f" Effect size: h={decision['cohens_h']:.2f}, Bayesian prob={decision['bayesian_prob']:.1%}" + ) + print(f" Cost: {tokens}") + print(f" Reason: {decision['reason']}") + print() + + # Generate config (use first model as default) + config = generate_config( + combined_df, reasoning_decisions, args.models[0], args.config_template + ) + + # Save config + os.makedirs( + ( + os.path.dirname(args.output_config) + if os.path.dirname(args.output_config) + else "." 
+ ), + exist_ok=True, + ) + with open(args.output_config, "w") as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False) + + # Save detailed results + results_file = args.output_config.replace(".yaml", "_results.csv") + combined_df.to_csv(results_file, index=False) + + # Save reasoning analysis + analysis_file = args.output_config.replace(".yaml", "_analysis.json") + with open(analysis_file, "w") as f: + json.dump(reasoning_decisions, f, indent=2) + + print(f"\nāœ… Config saved to: {args.output_config}") + print(f"šŸ“Š Results saved to: {results_file}") + print(f"šŸ“ˆ Analysis saved to: {analysis_file}") + + +if __name__ == "__main__": + main()