-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtest_exebench.py
More file actions
115 lines (98 loc) · 4.55 KB
/
test_exebench.py
File metadata and controls
115 lines (98 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""Run all exebench tests
"""
import argparse
import json
import sys
import subprocess
import itertools
from pathlib import Path
# Directory whose entries main() scans to discover runs.
RESULTS_DIR = Path("results")
# NOTE(review): not referenced anywhere in the visible source; main() takes its
# overwrite setting from the --overwrite flag instead. Confirm before relying on it.
OVERWRITE = True
# Evaluation partitions and exebench subpartitions; main() iterates over every
# (partition, subpartition) combination of these two tuples for each run.
PARTITION = ("test",)
SUBPARTITION = ("real",)
def get_args():
    """Parse the command-line options of this script.

    Returns:
        The ``argparse.Namespace`` with the attributes ``overwrite``,
        ``dry_run``, ``exclude``, ``require`` and ``batches``.
    """
    cli = argparse.ArgumentParser()

    # Boolean switches (False unless given on the command line).
    switches = (
        ("--overwrite", "If exebench results already exist, run the tests again and overwrite the existing scores."),
        ("--dry-run", "Only show the commands, don't actually run them."),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)

    # Substring filters matched against the command/run name.
    name_filters = (
        ("--exclude", "Exclude commands with this string in their name."),
        ("--require", "Require commands to have this string in their name."),
    )
    for flag, description in name_filters:
        cli.add_argument(flag, type=str, help=description)

    cli.add_argument(
        "--batches",
        type=int,
        help="Generate bash scripts that correspond to one batch each.",
    )
    return cli.parse_args()
# Example evaluator invocation, for reference: python evaluator.py runs/codegemma-2b-exebench-O0/ --dataset exebench-hf-O0-eval/ --eval-partition test --exebench-subpartition synth --batch-size 8 --no-exebench-tests
def make_command(run_name: str, checkpoint: str | None = None, dataset: str | None = None, eval_partition: str | None = None, exebench_subpartition: str | None = None) -> list[str]:
command = ["python", "evaluator.py"]
if checkpoint is None:
command.append(f"runs/{run_name}")
else:
command.append(f"runs/{run_name}/{checkpoint}")
command.append("--evaluate-existing-predictions")
if dataset is not None:
command.append("--dataset")
command.append(dataset)
if eval_partition is not None:
command.append("--eval-partition")
command.append("validation" if eval_partition == "valid" else eval_partition)
if exebench_subpartition is not None:
command.append("--exebench-subpartition")
command.append(exebench_subpartition)
return command
def eligible_for_exebench_tests(directory: Path, prefix: str, overwrite: bool) -> bool:
    """Decide whether the exebench tests should be run for one result set.

    Args:
        directory: Directory holding ``{prefix}_results.json`` and
            ``{prefix}_scores.json``.
        prefix: File-name prefix of the result set.
        overwrite: When true, existing exebench scores do not prevent a re-run.

    Returns:
        ``False`` when there is no results file, or when ``overwrite`` is
        false and the scores file already contains a metric whose name
        includes ``"exebench"``; ``True`` otherwise.
    """
    results_file = directory / f"{prefix}_results.json"
    scores_file = directory / f"{prefix}_scores.json"

    # Without predictions there is nothing to run the tests on.
    if not results_file.exists():
        return False
    if overwrite:
        return True

    try:
        with open(scores_file, "r") as handle:
            recorded = json.load(handle)
    except FileNotFoundError:
        print(f"WARNING: results file exists for {directory} but scores file does not.", file=sys.stderr)
        return True

    # Any metric mentioning "exebench" means the tests already ran.
    for metric in recorded:
        if "exebench" in metric:
            return False
    return True
def _split_into_batches(commands: list[list[str]], batches: int) -> list[list[list[str]]]:
    """Split ``commands`` into ``batches`` contiguous groups whose sizes differ by at most one."""
    base_size, remainder = divmod(len(commands), batches)
    groups = []
    start = 0
    for index in range(batches):
        # The first ``remainder`` groups take one extra command each, so no
        # command is dropped when the total is not a multiple of ``batches``.
        end = start + base_size + (1 if index < remainder else 0)
        groups.append(commands[start:end])
        start = end
    return groups


def main():
    """Collect the exebench test commands for all selected runs and dispatch them.

    Every entry of ``RESULTS_DIR`` whose name contains ``"exebench-O0"`` and
    passes the ``--exclude`` / ``--require`` filters is considered. Runs listed
    in ``results/best_checkpoints.json`` are evaluated at that checkpoint;
    "parity" runs without a listed checkpoint are skipped; all other runs are
    evaluated at the run directory itself.

    The resulting commands are always printed. With ``--batches N`` they are
    then written to ``batch{i}_exebench.sh`` scripts (this takes precedence
    over ``--dry-run``); otherwise they are executed one after another unless
    ``--dry-run`` is given.

    Raises:
        ValueError: If ``--batches`` is given but is smaller than 1.
    """
    args = get_args()
    overwrite: bool = args.overwrite
    dry_run: bool = args.dry_run
    exclude: str | None = args.exclude
    require: str | None = args.require
    batches: int | None = args.batches

    if batches is not None and batches < 1:
        raise ValueError("--batches must be a positive integer.")

    with open("results/best_checkpoints.json", "r") as fp:
        best_checkpoints: dict[str, str] = json.load(fp)

    commands = []
    for rundir in RESULTS_DIR.iterdir():
        if exclude is not None and exclude in rundir.name:
            continue
        if require is not None and require not in rundir.name:
            continue
        if "exebench-O0" not in rundir.name:
            continue

        if rundir.name in best_checkpoints:
            checkpoint = best_checkpoints[rundir.name]
            scores_dir = rundir / checkpoint / "exebench-hf-O0-eval"
        elif "parity" in rundir.name:
            # Parity runs are only evaluated at a recorded best checkpoint.
            continue
        else:
            checkpoint = None
            scores_dir = rundir / "exebench-hf-O0-eval"

        for partition, subpartition in itertools.product(PARTITION, SUBPARTITION):
            if eligible_for_exebench_tests(scores_dir, f"{partition}_{subpartition}", overwrite):
                commands.append(make_command(rundir.name, checkpoint, "exebench-hf-O0-eval", partition, subpartition))

    # Element 2 of each command is the run path, so this orders by run.
    commands.sort(key=lambda x: x[2])

    print("Commands to be run:")
    for command in commands:
        print(" ".join(command))
    print(f"{len(commands)} total commands.")
    print()

    if batches is not None:
        for i, batch in enumerate(_split_into_batches(commands, batches)):
            with open(f"batch{i}_exebench.sh", "w") as fp:
                fp.writelines(" ".join(command) + "\n" for command in batch)
    elif not dry_run:
        for i, command in enumerate(commands, start=1):
            print(f"Running command {i}/{len(commands)}")
            print(" ".join(command))
            completed = subprocess.run(command)
            if completed.returncode != 0:
                # Keep going with the remaining runs, but make the failure visible.
                print(f"WARNING: command exited with status {completed.returncode}.", file=sys.stderr)
            print("\n\n")
if __name__ == "__main__":
main()