-
Notifications
You must be signed in to change notification settings - Fork 56
Expand file tree
/
Copy patheval_science.py
More file actions
155 lines (128 loc) · 4.86 KB
/
eval_science.py
File metadata and controls
155 lines (128 loc) · 4.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import argparse
import os
import json
import torch
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import re
def parse_args():
parser = argparse.ArgumentParser(description="Evaluate a model on science test set")
parser.add_argument("--model_path", type=str, required=True,
help="Path to the trained model")
parser.add_argument("--max_new_tokens", type=int, default=2048,
help="Maximum number of tokens to generate")
parser.add_argument("--output_dir", type=str, default=None,
help="Directory to save evaluation results (defaults to model_path)")
parser.add_argument("--temperature", type=float, default=0.0,
help="Sampling temperature (0 for greedy)")
return parser.parse_args()
def load_model_and_tokenizer(model_path, gpu_memory_utilization=0.8):
"""Load model using vLLM and tokenizer from the given path."""
print(f"Loading model from {model_path}")
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left')
llm = LLM(
model=model_path,
gpu_memory_utilization=gpu_memory_utilization,
dtype=torch.bfloat16,
max_model_len=4096,
trust_remote_code=True,
)
return llm, tokenizer
def load_test_data():
"""Load science test dataset."""
path = 'data/science_data/eval_data'
print(f"Loading science test dataset from {path}")
data = Dataset.load_from_disk(path)
return data
def generate_responses(llm, tokenizer, prompts, max_new_tokens=2048, temperature=0.0):
"""Generate responses from the model using vLLM."""
formatted_prompts = []
for prompt in prompts:
formatted_prompt = tokenizer.apply_chat_template(
prompt,
tokenize=False,
add_generation_prompt=True
)
formatted_prompts.append(formatted_prompt)
sampling_params = SamplingParams(
temperature=temperature,
max_tokens=max_new_tokens,
stop_token_ids=[tokenizer.eos_token_id] if tokenizer.eos_token_id else None,
)
print(f"Generating responses for {len(formatted_prompts)} prompts...")
outputs = llm.generate(formatted_prompts, sampling_params)
return [output.outputs[0].text for output in outputs]
def extract_xml_answer(text: str) -> str:
"""Extract answer from XML-formatted text."""
answer = text.split("<answer>")[-1]
answer = answer.split("</answer>")[0]
return answer.strip()
def evaluate_correctness(responses, answers):
"""
Evaluate if responses match the golden answers.
Returns list of scores (1 for correct, 0 for incorrect).
"""
results = []
for response, answer in zip(responses, answers):
extracted = extract_xml_answer(response)
results.append(1 if extracted == answer else 0)
return results
def main():
args = parse_args()
# Load model and data
llm, tokenizer = load_model_and_tokenizer(args.model_path)
test_data = load_test_data()
prompts = [example['prompt'] for example in test_data]
answers = [example['answer'] for example in test_data]
# Generate responses
responses = generate_responses(
llm, tokenizer, prompts,
args.max_new_tokens,
args.temperature
)
# Evaluate correctness
print("\nEvaluating responses...")
scores = evaluate_correctness(responses, answers)
accuracy = np.mean(scores)
# Print results
print("\n" + "=" * 60)
print(f"Evaluation Results:")
print(f" Total samples: {len(scores)}")
print(f" Correct: {sum(scores)}")
print(f" Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print("=" * 60)
# Save results
output_dir = args.output_dir if args.output_dir else args.model_path
os.makedirs(output_dir, exist_ok=True)
results_to_save = {
"accuracy": float(accuracy),
"num_correct": int(sum(scores)),
"num_total": len(scores),
"per_sample_scores": scores,
"config": {
"model_path": args.model_path,
"max_new_tokens": args.max_new_tokens,
"temperature": args.temperature,
}
}
output_path = os.path.join(output_dir, "eval_results.json")
with open(output_path, "w") as f:
json.dump(results_to_save, f, indent=2)
print(f"\nSaved results to {output_path}")
# Save responses for inspection
responses_path = os.path.join(output_dir, "eval_responses.json")
with open(responses_path, "w") as f:
json.dump([
{
"prompt": prompts[i],
"response": responses[i],
"answer": answers[i],
"correct": bool(scores[i])
}
for i in range(len(responses))
], f, indent=2)
print(f"Saved responses to {responses_path}")
if __name__ == "__main__":
main()