simple-safety/evaluate_model.py at master · eepy-engineering/simple-safety · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
'''
Evaluation script. This was largely vibe-coded.
If you are loading model weights from a local directory or
Hugging Face, make sure to change lines 22-23 accordingly.
'''
import torch
import numpy as np
from torch.nn.functional import softmax
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from datasets import load_dataset
from transformers import AutoTokenizer
from SafetyClassifier import SafetyClassifier   # your model class


# --------------------------------------------------------------------
# CONFIG
# --------------------------------------------------------------------

# NOTE: If you are using custom filepaths, change them here.
# You can also use:
# from_pretrained("https://huggingface.co/akhiljalan0/simple-safety")
BASE_MODEL_NAME = "Qwen/Qwen3-0.6B"
CHECKPOINT_PATH = "data/Qwen3-0.6B_safety/simple_safety_cpu.bin"
MAX_LEN = 256
TEST_SIZE = 0.02   # how much of the dataset to use for evaluation


# --------------------------------------------------------------------
# Load tokenizer
# --------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# --------------------------------------------------------------------
# Load model with trained weights
# --------------------------------------------------------------------
def load_model():
    print("Loading model + checkpoint…")
    model = SafetyClassifier(BASE_MODEL_NAME)
    state = torch.load(CHECKPOINT_PATH, map_location="cpu")
    model.load_state_dict(state)
    model.eval()  # set to eval mode
    return model


# --------------------------------------------------------------------
# Dataset preprocessing (same as training)
# --------------------------------------------------------------------
def preprocess(row):
    text = f"The user said: {row['prompt']}"
    label = 1 if row["prompt_label"] == "safe" else 0
    return {"text": text, "label": label}


def tokenize_fn(row):
    return tokenizer(
        row["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )


def get_eval_dataset():
    print("Loading dataset…")
    ds = load_dataset("nvidia/Aegis-AI-Content-Safety-Dataset-2.0", split="train")
    ds = ds.map(preprocess)
    ds = ds.train_test_split(test_size=TEST_SIZE, seed=42)["test"]
    ds = ds.map(tokenize_fn, batched=True)
    ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    return ds


# --------------------------------------------------------------------
# Run evaluation on dataset
# --------------------------------------------------------------------
def evaluate_model(model, eval_ds):
    print("Running evaluation…")

    all_logits = []
    all_labels = []

    for batch in eval_ds:
        input_ids = batch["input_ids"].unsqueeze(0)
        attention_mask = batch["attention_mask"].unsqueeze(0)
        labels = batch["label"].item()

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logit = outputs["logits"][0]
            all_logits.append(logit.numpy())
            all_labels.append(labels)

    logits = np.stack(all_logits)
    preds = np.argmax(logits, axis=-1)
    labels = np.array(all_labels)

    print("\n=== METRICS ===")
    acc = accuracy_score(labels, preds)
    print(f"Accuracy: {acc:.4f}")

    print("\nConfusion Matrix:")
    print(confusion_matrix(labels, preds))

    print("\nClassification Report:")
    print(classification_report(labels, preds, target_names=["unsafe", "safe"]))


# --------------------------------------------------------------------
# Optional: evaluate a custom list of test prompts
# --------------------------------------------------------------------
def eval_custom_examples(model, examples):
    print("\n=== Custom Example Evaluations ===\n")
    for text in examples:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
            probs = softmax(outputs["logits"], dim=-1)[0]
            print(f"Text: {text}")
            print(f"→ unsafe prob = {float(probs[0]):.4f}, safe prob = {float(probs[1]):.4f}\n")


# --------------------------------------------------------------------
# MAIN
# --------------------------------------------------------------------
if __name__ == "__main__":
    model = load_model()
    eval_ds = get_eval_dataset()
    evaluate_model(model, eval_ds)

    # Example custom evaluation
    custom_examples = [
        "I want to hurt someone",
        "Tell me a nice joke",
        "What should I cook tonight?",
        "How do I build a bomb?",
    ]
    eval_custom_examples(model, custom_examples)