"""Evaluate a causal LM on 100 MMLU test questions via AutoModelForCausalLM.

Loads the full MMLU benchmark (all 57 subjects), prompts the model with each
question plus its choices, and reports substring-match accuracy.
"""
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

dataset = load_dataset("cais/mmlu", "all")
test_set = dataset["test"]

modelname = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForCausalLM.from_pretrained(
    modelname, torch_dtype=torch.float16, device_map="auto"
)


def ans_gen(q, choices):
    """Ask the model one multiple-choice question and return the picked choice.

    Parameters:
        q: the question text.
        choices: list of answer option strings (MMLU provides four).
    Returns:
        The first choice found as a substring of the model's continuation,
        or None if no choice appears.
    """
    prompt = f"Question: {q}\nChoices: {', '.join(choices)}\nAnswer:"
    # BUG FIX: move inputs to the model's actual device instead of
    # hard-coding "cuda" — device_map="auto" may shard or CPU-offload.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # BUG FIX: max_new_tokens bounds the generated answer itself;
    # max_length=300 counted the prompt too and could leave no room
    # to answer on long questions.
    outputs = model.generate(**inputs, max_new_tokens=50)
    # BUG FIX: decode ONLY the newly generated tokens.  Decoding the whole
    # sequence echoes the prompt, whose "Choices:" line contains every
    # option, so `choice in output` matched the FIRST choice every time
    # and the reported accuracy was meaningless.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    output = tokenizer.decode(new_tokens, skip_special_tokens=True)
    for choice in choices:
        if choice in output:
            return choice
    return None


correct = 0
total = 0

for sample in test_set.select(range(100)):  # evaluate the first 100 samples
    q = sample["question"]
    choices = sample["choices"]
    # MMLU's `answer` field is an integer index into `choices`.
    original = choices[sample["answer"]]

    if ans_gen(q, choices) == original:
        correct += 1
    total += 1

# Guard against division by zero if the sample range is ever empty.
accuracy = (correct / total) * 100 if total else 0.0
print(f"Model Accuracy {accuracy:.2f}%")
original: + correct += 1 + total += 1 + +accuracy = (correct / total) * 100 +print(f" Model Accuracy {accuracy:.2f}%") diff --git a/examples/mmlu_ev/MMLU_EV_pipeline.PY b/examples/mmlu_ev/MMLU_EV_pipeline.PY new file mode 100644 index 00000000..3023933c --- /dev/null +++ b/examples/mmlu_ev/MMLU_EV_pipeline.PY @@ -0,0 +1,46 @@ +from datasets import load_dataset +from transformers import pipeline +data_set = load_dataset("cais/mmlu", "all") # load the dataset + +Test_set = data_set["test"] + +pipe = pipeline("text-generation", model="google/gemma-3-27b-it") # choose the 27 b model + + +import random + +sampales=random.sample(list(Test_set),100) # taking 100 samples form the data set . + +''' +example of the dat set : + +{ + "question": "What is the embryological origin of the hyoid bone?", + "choices": ["The first pharyngeal arch", "The first and second pharyngeal arches", "The second pharyngeal arch", "The second and third pharyngeal arches"], + "answer": "D" +} + + + +''' +for sample in sampales: + q=sample["question"] + choices=sample["choices"] + original = choices[sample["answer"]] + +write= f" Question: {q}\nChoices: {', '.join(choices)}" + +ans= pipe(write, max_length=100, do_sample=False)[0]["generated_text"] + +print(f"Q: {q}") +print(f"Choices: {choices}") +print(f"LLM Response: {ans}") +print(f"Correct Answer: {original}") +correct=0 +total=0 +if ans== original: + correct +=1 +total += 1 +accuracy = (correct / total) * 100 + +print(f"accuracy: {accuracy: 2f}%") From 525ce26317ada69b2dcf3fb971cecfbc61fe2331 Mon Sep 17 00:00:00 2001 From: Debaditya Das <153950049+DEBADAS001KERNEL@users.noreply.github.com> Date: Fri, 4 Apr 2025 20:07:47 +0530 Subject: [PATCH 2/2] MMLU,script for HF I have added two codes: one using pipeline and another using AutoModel Encoder. I have loaded all the MMLU data, chosen 100 samples, and added a simple logic to check accuracy. 
It's under main -> examples -> mmlu_ev with two files: MMLU_EV_PIPELINE MMLU_EV_AUTOMODEL --- examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY | 2 +- examples/mmlu_ev/MMLU_EV_pipeline.PY | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY b/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY index aa229413..c0ffaad4 100644 --- a/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY +++ b/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY @@ -18,7 +18,7 @@ def ANS_GEN(q, choices): # ans inputs = tokenizer(write, return_tensors="pt").to("cuda") outputs = model.generate(**inputs, max_length=300) - output = tokenizer.decode(outputs[0], skip_special_tokens=True) + output = tokenizer.decode(outputs[0], skip_special_tokens=True)# for choice in choices: diff --git a/examples/mmlu_ev/MMLU_EV_pipeline.PY b/examples/mmlu_ev/MMLU_EV_pipeline.PY index 3023933c..966ed730 100644 --- a/examples/mmlu_ev/MMLU_EV_pipeline.PY +++ b/examples/mmlu_ev/MMLU_EV_pipeline.PY @@ -9,7 +9,7 @@ pipe = pipeline("text-generation", model="google/gemma-3-27b-it") # choose the 2 import random -sampales=random.sample(list(Test_set),100) # taking 100 samples form the data set . +sampales=random.sample(list(Test_set),100) # taking 100 samples form the data set .. ''' example of the dat set :