test.py
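
# Benchmark reporting script: runs the evaluation suite (or loads previously saved
# results via --plot), prints and saves a LaTeX results table, and renders a radar
# chart comparing models across benchmarks.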
import os
import json
import argparse
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.func import run, BENCHMARK_NAME_MAPPING, MODEL_NAME_MAPPING
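

# Command-line flags: pick which benchmarks to run (--all, --dummy, or individual
# suites), restrict models/seeds/tests, and optionally pass --plot with a path to
# saved JSON results to regenerate the table and chart without re-running.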
def parse_args():
parser = argparse.ArgumentParser(description='Run the benchmark.')
parser.add_argument('--context_associations', help='Run the in-context associations benchmark.', action='store_true')
parser.add_argument('--multimodal_bindings', help='Run the multimodal bindings benchmark.', action='store_true')
parser.add_argument('--program_synthesis', help='Run the program synthesis benchmark.', action='store_true')
parser.add_argument('--logic_components', help='Run the logic components benchmark.', action='store_true')
parser.add_argument('--computation_graphs', help='Run the computation graphs benchmark.', action='store_true')
parser.add_argument('--all', help='Run all benchmarks.', action='store_true')
parser.add_argument('--dummy', help='Run the dummy benchmark.', action='store_true')
parser.add_argument('--verbose', help='Print additional information.', action='store_true')
parser.add_argument('--models', help='Run the specified models.', default=['all'], type=str, nargs='+')
parser.add_argument('--seeds', help='Run the specified seeds.', default=['all'], type=int, nargs='+')
parser.add_argument('--tests', help='Run only specific tests.', default=['all'], type=str, nargs='+')
parser.add_argument('--plot', help='Plot the results.', type=str)
return parser.parse_args()
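

# LaTeX figure template: benchmark results table (left minipage) next to the radar
# chart exported by create_plot (right minipage). Doubled braces are literal braces
# preserved through str.format.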
LATEX_TEMPLATE = """
\\begin{{figure*}}[ht]
\\centering
\\begin{{minipage}}{{.6\\textwidth}}
\\centering
\\begin{{adjustbox}}{{max width=\\linewidth}}
\\begin{{tabular}}{{lccccccccc}}
\\toprule
\\textbf{{Benchmarks}} & {model_names} \\\\
\\midrule
{benchmark_in_context_association_row} \\\\
{benchmark_multimodality_row} \\\\
{benchmark_program_synthesis_row} \\\\
{benchmark_components_row} \\\\
{benchmark_computation_graphs_row} \\\\
\\midrule
\\textbf{{Total}} & {total_row} \\\\
\\bottomrule
\\end{{tabular}}
\\end{{adjustbox}}
\\label{{tab:benchmark_results}}
\\end{{minipage}}%
~
\\begin{{minipage}}{{.4\\textwidth}}
\\centering
\\begin{{adjustbox}}{{max width=\\linewidth}}
\\includegraphics[width=\\linewidth]{{images/benchmark_comparison_chart.pdf}}
\\end{{adjustbox}}
\\label{{fig:spider_plot}}
\\end{{minipage}}
\\caption{{Placeholder for performance benchmarks and comparison chart for various models.}}
\\label{{fig:my_label}}
\\end{{figure*}}
"""
def sort_by_name(model):
if 'GPT-4' in model:
return 0
elif 'GPT-3.5' in model:
return 1
elif 'Random' in model:
return 2
elif 'LlaMA' in model:
return 3
elif 'Mistral' in model:
return 4
elif 'Zephyr' in model:
return 5
elif 'Gemini' in model:
return 6
else:
return 7
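

# Sort key for (model, score) tuples: order by model name.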
def sort_items_by_name(model):
return sort_by_name(model[0])
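

# Display names used in the table and legend; currently an identity mapping,
# kept as a hook for renaming models.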
remap_name = {
'GPT-4 Turbo': 'GPT-4 Turbo',
'GPT-3.5 Turbo': 'GPT-3.5 Turbo',
'Gemini 1.0 Pro': 'Gemini 1.0 Pro',
'Gemini 1.5 Pro': 'Gemini 1.5 Pro',
'LlaMA 2 13B': 'LlaMA 2 13B',
'LlaMA 3 8B': 'LlaMA 3 8B',
'LlaMA 3 70B': 'LlaMA 3 70B',
'Zephyr 7B': 'Zephyr 7B',
'Mistral 7B': 'Mistral 7B',
'Random': 'Random'
}
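

# Build the LaTeX results table (best score per benchmark and best total in bold)
# and write it to tmp/benchmark_results.tex.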
def create_latex_result(data):
# Define the directory and file name
directory = 'tmp'
# make sure the directory exists
os.makedirs(directory, exist_ok=True)
filename = 'benchmark_results.tex'
filepath = os.path.join(directory, filename)
# Gather the model names
data_model_names = list(data.keys())
# Sort the models by name
data_model_names.sort(key=sort_by_name)
model_names = " & ".join(remap_name[key] for key in data_model_names)
# Initialize the total scores
total_scores = {model: 0.0 for model in data_model_names}
# Prepare table content
benchmark_rows = {bench_name: "" for bench_name in BENCHMARK_NAME_MAPPING.values()}
for bench_name in BENCHMARK_NAME_MAPPING.values():
if bench_name not in str(list(data.values())):
print(f"Skipping benchmark because not all results are computed. Did not find `{bench_name}` in `{data.keys()}`")
return
# Initialize list to keep the scores for this benchmark to find the best model
scores = [(model, np.mean([np.mean(run['scores']) for run in values[bench_name]['runs']])) for model, values in data.items()]
        # Sort the scores using the fixed model display order defined in sort_by_name
scores.sort(key=sort_items_by_name)
best_score = max(scores, key=lambda x: x[1])[1]
# Create row for the latex table and update the total scores
row = f"{bench_name}"
for model, score in scores:
# Add to the total score
total_scores[model] += score
# Format row with best model in bold
if score == best_score:
row += f" & \\textbf{{{score:.2f}}}"
else:
row += f" & {score:.2f}"
benchmark_rows[bench_name] = row
# Compute the average of total scores
for model in total_scores.keys():
total_scores[model] /= len(BENCHMARK_NAME_MAPPING)
# Best total performance in bold
best_total = max(total_scores.values())
total_values = " & ".join(f"\\textbf{{{v:.2f}}}" if v == best_total else f"{v:.2f}" for v in total_scores.values())
# Use the LATEX_TEMPLATE and inject the benchmark rows
latex_table = LATEX_TEMPLATE.format(
model_names=model_names,
benchmark_in_context_association_row=benchmark_rows[BENCHMARK_NAME_MAPPING['eval_in_context_associations']],
benchmark_multimodality_row=benchmark_rows[BENCHMARK_NAME_MAPPING['eval_multimodal_bindings']],
benchmark_program_synthesis_row=benchmark_rows[BENCHMARK_NAME_MAPPING['eval_program_synthesis']],
benchmark_components_row=benchmark_rows[BENCHMARK_NAME_MAPPING['eval_logic_components']],
benchmark_computation_graphs_row=benchmark_rows[BENCHMARK_NAME_MAPPING['eval_computation_graphs']],
total_row=total_values
)
# Print the latex table to the console
print(latex_table)
# Save the latex table to a file
if not os.path.exists(directory):
os.makedirs(directory)
with open(filepath, 'w') as f:
f.write(latex_table)
print(f"LaTeX table saved to {filepath}")
def create_plot(data):
# Define the categories and models
categories = list(next(iter(data.values())).keys()) # Assuming all models have the same structure
models = list(data.keys())
N = len(categories)
# Prepare data for plotting
values = [list(d.values()) for d in data.values()]
values = [[np.mean([np.mean(run['scores']) for run in v['runs']]) for v in sublist] for sublist in values]
values = np.array(values)
# Create a radar chart
angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
values = np.concatenate((values, values[:,[0]]), axis=1) # Repeat the first value to close the circle
angles += angles[:1]
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw=dict(polar=True))
sns.set_theme(context='paper', style='whitegrid')
def add_to_radar(values, model_name, color):
if model_name == MODEL_NAME_MAPPING['random']:
val = np.max(values) # Use the maximum value to draw a circle for the random model
angles_circle = np.linspace(0, 2 * np.pi, 100) # Use 100 points to make a smooth circle
ax.plot(angles_circle, np.full_like(angles_circle, val), '--', linewidth=2, color=color, label=model_name)
ax.fill(angles_circle, np.full_like(angles_circle, val), color=color, alpha=0.25)
else:
ax.plot(angles, values, color=color, linewidth=2, label=model_name)
ax.fill(angles, values, color=color, alpha=0.25)
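    # Pull one color per model from the axes' default property cycle (uses a private Matplotlib attribute).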
colors = [ax._get_lines.get_next_color() for _ in range(len(models))]
    zipped = zip(values, models, colors)
    # Sort by the fixed model display order
    zipped = sorted(zipped, key=lambda x: sort_by_name(x[1]))
    # Add each model to the radar chart
    for model_values, model_name, color in zipped:
        model_name = remap_name[model_name]
        add_to_radar(model_values, model_name, color)
# Add labels to the plot with increased label padding
label_padding = 1.1 # Adjust label padding as needed
ax.set_theta_offset(np.pi / 2)
ax.set_theta_direction(-1)
# Define font size
label_font_size = 15 # Choose desired font size
ax.set_thetagrids(np.degrees(angles[:-1]), categories, fontsize=label_font_size)
for label, angle in zip(ax.get_xticklabels(), angles):
if angle in (0, np.pi):
label.set_horizontalalignment('center')
elif 0 < angle < np.pi:
label.set_horizontalalignment('left')
else:
label.set_horizontalalignment('right')
label.set_position((label_padding, label.get_position()[1]))
# Increase the font size for the legend
plt.legend(loc='upper right', ncol=4, bbox_to_anchor=(1.38, 1.3, 0, 0), borderaxespad=0., fontsize=label_font_size)
# Set tight layout
plt.tight_layout()
# Save as PDF
plt.savefig("tmp/benchmark_comparison_chart.pdf", format="pdf", bbox_inches='tight')
# Show the plot
plt.show()
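

# Example invocations (the results path is illustrative):
#   python test.py --all --verbose
#   python test.py --plot <path/to/saved_results.json>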
if __name__ == '__main__':
args = parse_args()
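    # Re-run the benchmarks unless --plot points at previously saved JSON results.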
if args.plot is None:
results = run(args)
else:
with open(args.plot, 'r') as f:
results = json.load(f)
create_latex_result(results)
create_plot(results)