"""Evaluate a causal LM on 100 MMLU test questions via AutoModelForCausalLM.

Loads the full MMLU benchmark (all 57 subjects), prompts the model with each
question plus its choices, and reports substring-match accuracy.
"""
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

dataset = load_dataset("cais/mmlu", "all")
test_set = dataset["test"]

modelname = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForCausalLM.from_pretrained(
    modelname, torch_dtype=torch.float16, device_map="auto"
)


def ans_gen(q, choices):
    """Ask the model one multiple-choice question and return the picked choice.

    Parameters:
        q: the question text.
        choices: list of answer option strings (MMLU provides four).
    Returns:
        The first choice found as a substring of the model's continuation,
        or None if no choice appears.
    """
    prompt = f"Question: {q}\nChoices: {', '.join(choices)}\nAnswer:"
    # BUG FIX: move inputs to the model's actual device instead of
    # hard-coding "cuda" — device_map="auto" may shard or CPU-offload.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # BUG FIX: max_new_tokens bounds the generated answer itself;
    # max_length=300 counted the prompt too and could leave no room
    # to answer on long questions.
    outputs = model.generate(**inputs, max_new_tokens=50)
    # BUG FIX: decode ONLY the newly generated tokens.  Decoding the whole
    # sequence echoes the prompt, whose "Choices:" line contains every
    # option, so `choice in output` matched the FIRST choice every time
    # and the reported accuracy was meaningless.
    new_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    output = tokenizer.decode(new_tokens, skip_special_tokens=True)
    for choice in choices:
        if choice in output:
            return choice
    return None


correct = 0
total = 0

for sample in test_set.select(range(100)):  # evaluate the first 100 samples
    q = sample["question"]
    choices = sample["choices"]
    # MMLU's `answer` field is an integer index into `choices`.
    original = choices[sample["answer"]]

    if ans_gen(q, choices) == original:
        correct += 1
    total += 1

# Guard against division by zero if the sample range is ever empty.
accuracy = (correct / total) * 100 if total else 0.0
print(f"Model Accuracy {accuracy:.2f}%")
original: + correct += 1 + total += 1 + +accuracy = (correct / total) * 100 +print(f" Model Accuracy {accuracy:.2f}%") diff --git a/examples/mmlu_ev/MMLU_EV_pipeline.PY b/examples/mmlu_ev/MMLU_EV_pipeline.PY new file mode 100644 index 00000000..3023933c --- /dev/null +++ b/examples/mmlu_ev/MMLU_EV_pipeline.PY @@ -0,0 +1,46 @@ +from datasets import load_dataset +from transformers import pipeline +data_set = load_dataset("cais/mmlu", "all") # load the dataset + +Test_set = data_set["test"] + +pipe = pipeline("text-generation", model="google/gemma-3-27b-it") # choose the 27 b model + + +import random + +sampales=random.sample(list(Test_set),100) # taking 100 samples form the data set . + +''' +example of the dat set : + +{ + "question": "What is the embryological origin of the hyoid bone?", + "choices": ["The first pharyngeal arch", "The first and second pharyngeal arches", "The second pharyngeal arch", "The second and third pharyngeal arches"], + "answer": "D" +} + + + +''' +for sample in sampales: + q=sample["question"] + choices=sample["choices"] + original = choices[sample["answer"]] + +write= f" Question: {q}\nChoices: {', '.join(choices)}" + +ans= pipe(write, max_length=100, do_sample=False)[0]["generated_text"] + +print(f"Q: {q}") +print(f"Choices: {choices}") +print(f"LLM Response: {ans}") +print(f"Correct Answer: {original}") +correct=0 +total=0 +if ans== original: + correct +=1 +total += 1 +accuracy = (correct / total) * 100 + +print(f"accuracy: {accuracy: 2f}%") From 525ce26317ada69b2dcf3fb971cecfbc61fe2331 Mon Sep 17 00:00:00 2001 From: Debaditya Das <153950049+DEBADAS001KERNEL@users.noreply.github.com> Date: Fri, 4 Apr 2025 20:07:47 +0530 Subject: [PATCH 2/2] MMLU,script for HF I have added two codes: one using pipeline and another using AutoModel Encoder. I have loaded all the MMLU data, chosen 100 samples, and added a simple logic to check accuracy. 
It's under main -> examples -> mmlu_ev with two files: MMLU_EV_PIPELINE MMLU_EV_AUTOMODEL --- examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY | 2 +- examples/mmlu_ev/MMLU_EV_pipeline.PY | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY b/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY index aa229413..c0ffaad4 100644 --- a/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY +++ b/examples/mmlu_ev/MMLU_EV_AUTOMODEL.PY @@ -18,7 +18,7 @@ def ANS_GEN(q, choices): # ans inputs = tokenizer(write, return_tensors="pt").to("cuda") outputs = model.generate(**inputs, max_length=300) - output = tokenizer.decode(outputs[0], skip_special_tokens=True) + output = tokenizer.decode(outputs[0], skip_special_tokens=True)# for choice in choices: diff --git a/examples/mmlu_ev/MMLU_EV_pipeline.PY b/examples/mmlu_ev/MMLU_EV_pipeline.PY index 3023933c..966ed730 100644 --- a/examples/mmlu_ev/MMLU_EV_pipeline.PY +++ b/examples/mmlu_ev/MMLU_EV_pipeline.PY @@ -9,7 +9,7 @@ pipe = pipeline("text-generation", model="google/gemma-3-27b-it") # choose the 2 import random -sampales=random.sample(list(Test_set),100) # taking 100 samples form the data set . +sampales=random.sample(list(Test_set),100) # taking 100 samples form the data set .. ''' example of the dat set :