# question_plots.py
import json
from pathlib import Path
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import t

COMPANY_COLORS = {
    'openai': '#74AA9C',
    'meta': '#044EAB',
    'anthropic': '#D4C5B9',
    'google': '#669DF7',
    'x': '#000000',
    'mistral': '#F54E42',
}


def _get_company_from_model(model_name: str) -> str:
    """Map a 'company/model' identifier to the company key used in COMPANY_COLORS."""
    company = model_name.split('/')[0]
    if company == 'meta-llama':
        return 'meta'
    if company == 'x-ai':
        return 'x'
    if company == 'mistralai':
        return 'mistral'
    return company
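# For example, 'meta-llama/llama-3.1-405b-instruct' maps to 'meta', while an
# id such as 'google/gemini-pro-1.5' passes its prefix through unchanged.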


def _load_results() -> dict:
    """Load the scored answers from results.json in the working directory."""
    with open('results.json', 'r') as f:
        return json.load(f)


def _extract_scores(results: dict) -> pd.DataFrame:
    """Flatten the nested results dict into one row per (model, temperature, question, answer)."""
    data = []
    for model, model_data in results['models'].items():
        for temperature, temp_data in model_data.items():
            for question, answers in temp_data.items():
                for answer in answers:
                    data.append({
                        'model': model,
                        'company': _get_company_from_model(model),
                        'temperature': float(temperature),
                        'question': question,
                        'answer_num': answer['answer_num'],
                        'embedding_score': answer['embedding_dissimilarity_score'],
                        'coherence_score': answer['coherence_score'],
                    })
    return pd.DataFrame(data)
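# For reference, the nested results.json layout that _extract_scores walks.
# Key names come from the code above; the concrete values are illustrative:
#
# {
#   "models": {
#     "openai/o1-preview": {
#       "0.7": {                          <- temperature, parsed with float()
#         "Some question text": [
#           {"answer_num": 1,
#            "embedding_dissimilarity_score": 0.42,
#            "coherence_score": 17.3}
#         ]
#       }
#     }
#   }
# }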


def _get_best_models(df: pd.DataFrame) -> dict:
    """Return the hand-picked best model per company for each metric.

    The selection is hardcoded from the latest scores; `df` is currently unused.
    """
    best_models = {
        'embedding_score': {
            'openai': 'openai/o1-preview',
            'anthropic': 'anthropic/claude-3.5-sonnet',
            'google': 'google/gemini-pro-1.5',
            'meta': 'meta-llama/llama-3.1-405b-instruct',
            'mistral': 'mistralai/mistral-large-latest',
            'x': 'x-ai/grok-beta'
        },
        'coherence_score': {
            'openai': 'openai/o1-preview',
            'anthropic': 'anthropic/claude-3.5-sonnet',
            'google': 'google/gemini-pro-1.5',
            'meta': 'meta-llama/llama-3.1-405b-instruct',
            'mistral': 'mistralai/mistral-large-latest',
            'x': 'x-ai/grok-beta'
        }
    }
    return best_models


def _remove_answer_count_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Drop questions whose maximum answer count falls outside the 1.5*IQR fences."""
    answer_counts = df.groupby('question')['answer_num'].max()
    Q1, Q3 = answer_counts.quantile(0.25), answer_counts.quantile(0.75)
    IQR = Q3 - Q1
    normal_questions = answer_counts[
        (answer_counts >= Q1 - 1.5 * IQR) &
        (answer_counts <= Q3 + 1.5 * IQR)
    ].index
    return df[df['question'].isin(normal_questions)]
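# Worked example of the IQR fence above (hypothetical counts): if the
# per-question max answer counts are [48, 50, 50, 52, 90], then Q1 = 50,
# Q3 = 52, IQR = 2, the fences are 50 - 3 = 47 and 52 + 3 = 55, and only
# the question with 90 answers is dropped.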


def _smooth_series(series: pd.Series, window: int = 5) -> pd.Series:
    """Centered rolling mean; min_periods=1 keeps the ends of the series defined."""
    return series.rolling(window=window, center=True, min_periods=1).mean()
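# Example (hypothetical values): smoothing pd.Series([1, 2, 3, 4, 5]) with
# window=5 yields [2.0, 2.5, 3.0, 3.5, 4.0]; thanks to min_periods=1 the first
# point averages just the values the centered window can reach (1, 2, 3).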


def _plot_company_models(df: pd.DataFrame,
                         company: str,
                         metric: str,
                         title: str,
                         output_path: str):
    plt.figure(figsize=(16, 10))
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['font.family'] = 'serif'

    company_df = df[df['company'] == company]
    color = COMPANY_COLORS[company]

    # Add reference line based on metric
    if metric == 'coherence_score':
        plt.axhline(y=15, color='gray', linestyle=':', alpha=0.5)
    elif metric == 'embedding_score':
        plt.axhline(y=0.15, color='gray', linestyle=':', alpha=0.5)

    # Define line styles for differentiation
    line_styles = [
        {'linestyle': 'solid', 'linewidth': 1.5},
        {'linestyle': 'dashed', 'linewidth': 1.5},
        {'linestyle': 'dotted', 'linewidth': 2.0},
        {'linestyle': 'dashdot', 'linewidth': 1.5},
        {'linestyle': (0, (5, 2, 1, 2)), 'linewidth': 1.5},   # custom dash pattern
        {'linestyle': (0, (1, 1)), 'linewidth': 1.5},         # dense dotted
        {'linestyle': (0, (10, 2, 1, 2)), 'linewidth': 1.5},  # custom dash-dot
        {'linestyle': (0, (5, 1)), 'linewidth': 1.5},         # dense dashed
    ]

    # Calculate mean values for each model and answer number
    mean_df = company_df.groupby(['model', 'answer_num'])[metric].mean().reset_index()

    # Plot each model's trend line with a distinct line style
    for idx, model in enumerate(sorted(company_df['model'].unique())):
        model_df = mean_df[mean_df['model'] == model].sort_values('answer_num')
        smoothed_values = _smooth_series(model_df[metric])

        # Cycle through styles if there are more models than styles
        style = line_styles[idx % len(line_styles)]
        plt.plot(model_df['answer_num'],
                 smoothed_values,
                 color=color,
                 alpha=0.9,
                 label=model.split('/')[-1],
                 **style)  # unpack the style dictionary

        # Add model name at the end of the line
        last_x = model_df['answer_num'].iloc[-1]
        last_y = smoothed_values.iloc[-1]
        plt.annotate(model.split('/')[-1],
                     xy=(last_x, last_y),
                     xytext=(5, 0),
                     textcoords='offset points',
                     va='center',
                     color=color,
                     fontsize=9,
                     fontweight='bold')

    plt.title(f"{title}\n{company.title()}", fontsize=16, pad=20)
    plt.xlabel('Answer Number', fontsize=14)
    plt.ylabel(metric.replace('_', ' ').title(), fontsize=14)

    # Add legend for line styles
    plt.legend(fontsize=9,
               title=f"{company.title()} Models",
               bbox_to_anchor=(1.05, 1),
               loc='upper left')

    plt.grid(True, alpha=0.2)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()
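# Usage sketch (mirrors the calls in generate_score_plots below; the company
# and output path here are illustrative):
#
#     _plot_company_models(df_filtered, 'anthropic', 'coherence_score',
#                          'Coherence Score Variation Over Answers',
#                          'plots/anthropic_coherence.png')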


def _plot_best_models(df: pd.DataFrame,
                      metric: str,
                      best_models: dict,
                      title: str,
                      output_path: str):
    plt.figure(figsize=(16, 10))
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['font.family'] = 'serif'

    # Add reference line based on metric
    if metric == 'coherence_score':
        plt.axhline(y=15, color='gray', linestyle=':', alpha=0.5)
    elif metric == 'embedding_score':
        plt.axhline(y=0.15, color='gray', linestyle=':', alpha=0.5)

    # Get models for this metric
    metric_best_models = best_models[metric]

    # Calculate mean values for the best models
    mean_df = df[df['model'].isin(metric_best_models.values())].groupby(
        ['model', 'answer_num'])[metric].mean().reset_index()

    # Plot each company's best model
    for company, model in metric_best_models.items():
        color = COMPANY_COLORS[company]
        model_df = mean_df[mean_df['model'] == model].sort_values('answer_num')
        smoothed_values = _smooth_series(model_df[metric])

        plt.plot(model_df['answer_num'],
                 smoothed_values,
                 linewidth=1.5,
                 color=color)

        # Add model name at the end of the line
        last_x = model_df['answer_num'].iloc[-1]
        last_y = smoothed_values.iloc[-1]
        plt.annotate(model.split('/')[-1],
                     xy=(last_x, last_y),
                     xytext=(5, 0),
                     textcoords='offset points',
                     va='center',
                     color=color,
                     fontsize=9,
                     fontweight='bold')

    plt.title(f"{title}\nBest Model per Company", fontsize=16, pad=20)
    plt.xlabel('Answer Number', fontsize=14)
    plt.ylabel(metric.replace('_', ' ').title(), fontsize=14)

    # Add company legend
    handles = [plt.Line2D([0], [0], color=color, label=company.title(), linewidth=1.5)
               for company, color in COMPANY_COLORS.items()]
    plt.legend(handles=handles,
               title='Companies',
               bbox_to_anchor=(1.05, 1),
               loc='upper left',
               fontsize=10)

    plt.grid(True, alpha=0.2)
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight')
    plt.close()


def _compute_score_statistics(df: pd.DataFrame) -> pd.DataFrame:
    """Compute detailed statistics for scores, including CIs and clustered SEs."""
    # Reset the index first to avoid multi-index issues
    stats_df = df.groupby(['model', 'question']).agg({
        'embedding_score': ['mean', 'std', 'count'],
        'coherence_score': ['mean', 'std', 'count']
    }).reset_index()

    # Flatten column names
    stats_df.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col
                        for col in stats_df.columns]

    for metric in ['embedding_score', 'coherence_score']:
        # Confidence interval per (model, question) cell
        stats_df[f'{metric}_ci'] = stats_df.apply(
            lambda row: compute_confidence_interval(
                df[
                    (df['model'] == row['model']) &
                    (df['question'] == row['question'])
                ][metric].values
            ),
            axis=1
        )

        # Standard errors clustered on question, per model
        stats_df[f'{metric}_clustered_se'] = stats_df.apply(
            lambda row: compute_clustered_se(
                df[df['model'] == row['model']],
                metric,
                'question'
            ),
            axis=1
        )
    return stats_df
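# After flattening, stats_df carries one row per (model, question) with columns
# model, question, and {metric}_mean, {metric}_std, {metric}_count,
# {metric}_ci, {metric}_clustered_se for each of embedding_score and
# coherence_score.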


def analyze_questions(df: pd.DataFrame) -> dict:
    """Analyzes questions across multiple dimensions and returns insights."""
    question_stats = {}

    # Calculate mean scores per question
    mean_scores = df.groupby('question').agg({
        'embedding_score': ['mean', 'std'],
        'coherence_score': ['mean', 'std']
    }).round(3)

    # Flatten column names
    mean_scores.columns = ['_'.join(col).strip('_') for col in mean_scores.columns]

    # Questions with the highest/lowest scores
    question_stats['top_coherence'] = mean_scores.nlargest(5, 'coherence_score_mean')['coherence_score_mean']
    question_stats['bottom_coherence'] = mean_scores.nsmallest(5, 'coherence_score_mean')['coherence_score_mean']
    question_stats['top_embedding'] = mean_scores.nlargest(5, 'embedding_score_mean')['embedding_score_mean']
    question_stats['bottom_embedding'] = mean_scores.nsmallest(5, 'embedding_score_mean')['embedding_score_mean']

    # Score variability
    question_stats['most_variable'] = mean_scores.nlargest(5, 'coherence_score_std')['coherence_score_std']
    question_stats['most_consistent'] = mean_scores.nsmallest(5, 'coherence_score_std')['coherence_score_std']

    # Detailed statistical analysis
    stats_df = _compute_score_statistics(df)
    question_stats['statistics'] = stats_df

    # Paired differences between the five most-sampled models
    top_models = df['model'].value_counts().nlargest(5).index
    paired_comparisons = []
    for i, model_a in enumerate(top_models):
        for model_b in top_models[i + 1:]:
            comparison = analyze_paired_differences(df, model_a, model_b)
            paired_comparisons.append({
                'model_a': model_a,
                'model_b': model_b,
                **comparison
            })
    question_stats['paired_comparisons'] = pd.DataFrame(paired_comparisons)

    return question_stats


def _plot_question_analysis(df: pd.DataFrame, output_dir: Path):
    """Generates detailed question analysis visualizations."""
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.rcParams['font.family'] = 'serif'

    # Violin plots for both metrics
    for metric in ['coherence_score', 'embedding_score']:
        plt.figure(figsize=(20, 12))
        sns.violinplot(data=df,
                       x='question',
                       y=metric,
                       cut=0,          # don't extend beyond data bounds
                       scale='width',  # equal-width violins ('scale' was renamed 'density_norm' in seaborn 0.13)
                       inner='box')    # show a box plot inside each violin

        # Customize appearance
        plt.xticks(rotation=45, ha='right')
        plt.xlabel('Questions', fontsize=12)
        plt.ylabel(metric.replace('_', ' ').title(), fontsize=12)

        # Truncate over-long question labels
        ax = plt.gca()
        labels = [label.get_text()[:50] + '...' if len(label.get_text()) > 50
                  else label.get_text()
                  for label in ax.get_xticklabels()]
        ax.set_xticklabels(labels)

        # Add summary statistics as text; the loop variable is named q_stats so
        # it does not shadow the module-level scipy `stats` import
        question_stats = df.groupby('question')[metric].agg(['mean', 'std', 'min', 'max'])
        for idx, (question, q_stats) in enumerate(question_stats.iterrows()):
            stats_text = f'μ={q_stats["mean"]:.2f}\nσ={q_stats["std"]:.2f}'
            plt.text(idx, plt.ylim()[0], stats_text,
                     ha='center', va='top', fontsize=8, rotation=45)

        plt.title(f'Distribution of {metric.replace("_", " ").title()} by Question',
                  pad=20, fontsize=14)
        plt.grid(True, alpha=0.2)
        plt.tight_layout()

        # Save high-resolution plot
        plt.savefig(output_dir / f'question_{metric}_violin.png',
                    dpi=300, bbox_inches='tight')
        plt.close()

    # Correlation heatmap: how per-company mean coherence co-varies across questions
    plt.figure(figsize=(15, 15))
    question_correlations = df.pivot_table(
        values='coherence_score',
        index='question',
        columns='company',
        aggfunc='mean'
    ).corr()

    sns.heatmap(question_correlations,
                annot=True,
                cmap='RdYlBu_r',
                center=0,
                fmt='.2f',
                square=True)
    plt.title('Question Performance Correlation Across Companies', pad=20)
    plt.tight_layout()
    plt.savefig(output_dir / 'question_correlations.png', dpi=300, bbox_inches='tight')
    plt.close()


def generate_question_analysis_report(stats: dict, output_dir: Path):
    """Generates a markdown report with question analysis insights."""
    # Series.to_markdown() relies on the optional `tabulate` dependency
    report = [
        "# Question Analysis Report\n",
        "## Top Performing Questions (Coherence)",
        stats['top_coherence'].to_markdown(),
        "\n## Lowest Performing Questions (Coherence)",
        stats['bottom_coherence'].to_markdown(),
        "\n## Questions with Most Variable Responses",
        stats['most_variable'].to_markdown(),
        "\n## Questions with Most Consistent Responses",
        stats['most_consistent'].to_markdown(),
    ]
    with open(output_dir / 'question_analysis.md', 'w') as f:
        f.write('\n\n'.join(report))


def generate_score_plots():
    output_dir = Path('plots')
    output_dir.mkdir(exist_ok=True)

    results = _load_results()
    df = _extract_scores(results)
    df_filtered = _remove_answer_count_outliers(df)

    # Question-level analysis
    question_stats = analyze_questions(df_filtered)
    _plot_question_analysis(df_filtered, output_dir)
    generate_question_analysis_report(question_stats, output_dir)

    # Get the best models for each company
    best_models = _get_best_models(df_filtered)

    # Company-specific plots
    for company in COMPANY_COLORS.keys():
        _plot_company_models(
            df_filtered,
            company,
            'embedding_score',
            'Embedding Dissimilarity Score Decay Over Answers',
            output_dir / f'{company}_embedding_decay.png'
        )
        _plot_company_models(
            df_filtered,
            company,
            'coherence_score',
            'Coherence Score Variation Over Answers',
            output_dir / f'{company}_coherence.png'
        )

    # Best-model comparison plots
    _plot_best_models(
        df_filtered,
        'embedding_score',
        best_models,
        'Embedding Dissimilarity Score Decay Over Answers',
        output_dir / 'best_models_embedding_decay.png'
    )
    _plot_best_models(
        df_filtered,
        'coherence_score',
        best_models,
        'Coherence Score Variation Over Answers',
        output_dir / 'best_models_coherence.png'
    )
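# Running generate_score_plots() writes, for each company c in COMPANY_COLORS,
# plots/{c}_embedding_decay.png and plots/{c}_coherence.png, plus
# best_models_embedding_decay.png, best_models_coherence.png,
# question_coherence_score_violin.png, question_embedding_score_violin.png,
# question_correlations.png and question_analysis.md under plots/.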


def compute_confidence_interval(data: np.ndarray, confidence: float = 0.95) -> Tuple[float, float]:
    """Compute a confidence interval using the t-distribution.

    Needs at least two observations: stats.sem of a single value is NaN.
    """
    n = len(data)
    mean = np.mean(data)
    se = stats.sem(data)  # standard error of the mean
    ci = t.interval(confidence, n - 1, loc=mean, scale=se)
    return ci
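# Doctest-style sketch (illustrative numbers):
#
#     >>> vals = np.array([0.42, 0.38, 0.51, 0.47, 0.45])
#     >>> lo, hi = compute_confidence_interval(vals)
#     >>> lo < vals.mean() < hi
#     True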


def compute_clustered_se(data: pd.DataFrame,
                         score_col: str,
                         cluster_col: str) -> float:
    """Compute a cluster-adjusted standard error.

    This pools within-cluster and between-cluster sums of squared deviations;
    it is a simple adjustment, not the CR0/CR1 sandwich estimator usually
    meant by "clustered standard errors".
    """
    # Per-cluster means and the overall mean
    cluster_means = data.groupby(cluster_col)[score_col].mean()
    overall_mean = data[score_col].mean()

    n = len(data)
    n_clusters = len(cluster_means)

    # Sum of squared deviations within clusters
    within_cluster_dev = sum(
        sum((val - cluster_means[cluster]) ** 2
            for val in data[data[cluster_col] == cluster][score_col])
        for cluster in cluster_means.index
    )

    # Between-cluster (size-weighted) sum of squared deviations
    between_cluster_dev = sum(
        len(data[data[cluster_col] == cluster]) * (mean - overall_mean) ** 2
        for cluster, mean in cluster_means.items()
    )

    clustered_se = np.sqrt((within_cluster_dev + between_cluster_dev) / (n * (n_clusters - 1)))
    return clustered_se
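# Note on the formula above: the within- and between-cluster sums of squares
# add up to the total sum of squares (the usual ANOVA decomposition), so this
# is equivalent to
#     se = sqrt(SS_total / (n * (n_clusters - 1)))
# i.e. a total-variance estimate scaled down by the number of clusters.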


def analyze_paired_differences(df: pd.DataFrame,
                               model_a: str,
                               model_b: str) -> dict:
    """Analyze paired differences between two models on coherence_score."""
    # Pair the two models' scores on the shared (question, answer_num) keys.
    # A plain pivot on the raw row index would leave each row with one NaN
    # column and make every difference NaN, so pivot_table on the keys instead;
    # aggfunc='mean' averages over temperatures when a pair repeats.
    paired_data = df[df['model'].isin([model_a, model_b])].pivot_table(
        index=['question', 'answer_num'],
        columns='model',
        values='coherence_score',
        aggfunc='mean'
    ).dropna()

    differences = paired_data[model_a] - paired_data[model_b]

    # Paired statistics
    mean_diff = differences.mean()
    se = stats.sem(differences)
    ci = t.interval(0.95, len(differences) - 1, loc=mean_diff, scale=se)
    t_stat, p_value = stats.ttest_rel(paired_data[model_a], paired_data[model_b])

    return {
        'mean_difference': mean_diff,
        'standard_error': se,
        'confidence_interval': ci,
        't_statistic': t_stat,
        'p_value': p_value
    }
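# Usage sketch (model ids are illustrative; any two ids present in df work):
#
#     result = analyze_paired_differences(df, 'openai/o1-preview',
#                                         'anthropic/claude-3.5-sonnet')
#     print(result['mean_difference'], result['p_value'])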


if __name__ == "__main__":
    generate_score_plots()