diff --git a/.gitignore b/.gitignore index 3ee7b311c..5fddf5044 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,5 @@ __pycache__ .ipynb_checkpoints wandb/ artifacts/ + +**/.env \ No newline at end of file diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/.gitignore b/end-to-end-use-cases/benchmarks/llm_eval_harness/.gitignore new file mode 100644 index 000000000..5cc9e3fea --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/.gitignore @@ -0,0 +1,2 @@ +**/eval_results/** +**/old_eval_results/** diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml index c16877839..6d17e5fdf 100644 --- a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/eval_config.yaml @@ -1,9 +1,9 @@ -model_name: "meta-llama/Llama-3.1-8B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub." +model_name: "meta-llama/Llama-3.3-70B-Instruct" # The name of the model to evaluate. This must be a valid Meta Llama 3 based model name in the HuggingFace model hub. -evals_dataset: "meta-llama/Llama-3.1-8B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate, please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection. +evals_dataset: "meta-llama/Llama-3.1-70B-Instruct-evals" # The name of the 3.1 evals dataset to evaluate; please make sure this eval dataset corresponds to the model loaded. This must be a valid dataset name in the Llama 3.x Evals collection. # Must be one of the following ["meta-llama/Llama-3.1-8B-Instruct-evals","meta-llama/Llama-3.1-70B-Instruct-evals","meta-llama/Llama-3.1-405B-Instruct-evals","meta-llama/Llama-3.1-8B-evals","meta-llama/Llama-3.1-70B-evals","meta-llama/Llama-3.1-405B-evals","meta-llama/Llama-3.2-1B-evals","meta-llama/Llama-3.2-3B-evals", "meta-llama/Llama-3.2-1B-Instruct-evals", "meta-llama/Llama-3.2-3B-Instruct-evals"] -tasks: "meta_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them. +tasks: "meta_mmlu_pro_instruct" # Available tasks for 3.1 instruct model: "meta_math_hard", "meta_gpqa_cot", "meta_mmlu_pro_instruct", "meta_ifeval"; or just use "meta_instruct" to run all of them. # Available tasks for 3.1 pretrain model: "meta_bbh", "meta_mmlu_pro_pretrain"; or just use "meta_pretrain" to run all of them. # Available tasks for 3.2 instruct model: "meta_mmlu", "meta_math", "meta_gpqa"; or just use "meta_instruct" to run all of them.
# Available tasks for 3.2 pretrain model: "meta_mmlu"; or just use "meta_pretrain" to run all of them diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml new file mode 100644 index 000000000..42b52f16b --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/bbh_3shot_cot.yaml @@ -0,0 +1,28 @@ +dataset_path: meta-llama/Llama-3.1-70B-evals +dataset_name: Llama-3.1-70B-evals__bbh__details +task: meta_bbh +output_type: generate_until +process_docs: !function utils.process_docs +test_split: latest +doc_to_text: !function utils.doc_to_text +doc_to_target: answer +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: 'the answer is (.*?)\.' + - function: "take_first" +generation_kwargs: + until: "\n\nQ: " + do_sample: false + temperature: 0 + max_gen_toks: 512 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py new file mode 100644 index 000000000..1e1ed449c --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/bbh/utils.py @@ -0,0 +1,21 @@ +import random +import re + +import datasets + + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "answer": doc["input_correct_responses"][0], + } + return out_doc + dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","output_prediction_text"]) + dataset = dataset.rename_column("is_correct","previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml new file mode 100644 index 000000000..2e7ccd806 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/gpqa_0shot.yaml @@ -0,0 +1,29 @@ +dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals +dataset_name: Llama-3.1-70B-Instruct-evals__gpqa__details +task: meta_gpqa +output_type: generate_until +process_docs: !function utils.process_docs +test_split: latest +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: ' ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 2048 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py new file mode 100644 index 000000000..7809ba8f6 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa/utils.py @@ -0,0 
+1,19 @@ +import random +import re + +import datasets + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"]) + dataset = dataset.rename_column("is_correct","previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml new file mode 100644 index 000000000..693593be1 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/gpqa_0shot_cot.yaml @@ -0,0 +1,29 @@ +dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals +dataset_name: Llama-3.1-70B-Instruct-evals__gpqa__details +task: meta_gpqa_cot +output_type: generate_until +process_docs: !function utils.process_docs +test_split: latest +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 2048 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py new file mode 100644 index 000000000..6a0349fca --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/gpqa_cot/utils.py @@ -0,0 +1,20 @@ +import random +import re + +import datasets + + + +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + dataset = dataset.select_columns(["input_question", "input_correct_responses", "input_final_prompts", "is_correct","input_question_hash","input_choice_list","output_prediction_text"]) + dataset = dataset.rename_column("is_correct","previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml new file mode 100644 index 000000000..c7196d16d --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: meta_ifeval +dataset_path: parquet +dataset_kwargs: + data_files: ./work_dir/joined_ifeval.parquet +output_type: generate_until +test_split: train +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - 
metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py new file mode 100644 index 000000000..7caecb1a2 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/ifeval/utils.py @@ -0,0 +1,139 @@ +import dataclasses +from typing import Dict, Optional, Union + +from lm_eval.tasks.ifeval import instructions_registry + + +@dataclasses.dataclass +class InputExample: + key: int + instruction_id_list: list[str] + prompt: str + kwargs: list[Dict[str, Optional[Union[str, int]]]] + + +@dataclasses.dataclass +class OutputExample: + instruction_id_list: list[str] + prompt: str + response: str + follow_all_instructions: bool + follow_instruction_list: list[bool] + + +def test_instruction_following_strict( + inp, + response, +): + """Tests response to see if instructions are followed.""" + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. 
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list.append(is_following) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def process_results(doc, results): + new_kwargs = [] + for item in doc["kwargs"]: + if item["nth_paragraph"]: + item["nth_paragraph"] = int(item["nth_paragraph"]) + new_kwargs.append(item) + inp = InputExample( + key=doc["key"], + instruction_id_list=doc["instruction_id_list"], + prompt=doc["prompt"], + kwargs=new_kwargs, + ) + response = results[0] + + out_strict = test_instruction_following_strict(inp, response) + out_loose = test_instruction_following_loose(inp, response) + + return { + "prompt_level_strict_acc": out_strict.follow_all_instructions, + "inst_level_strict_acc": out_strict.follow_instruction_list, + "prompt_level_loose_acc": out_loose.follow_all_instructions, + "inst_level_loose_acc": out_loose.follow_instruction_list, + } + + +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml new file mode 100644 index 000000000..edd95630d --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_4shot_cot.yaml @@ -0,0 +1,21 @@ +dataset_path: parquet +dataset_kwargs: + data_files: ./work_dir/joined_math.parquet +task: meta_math +process_docs: !function utils.process_docs +output_type: generate_until +test_split: train +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: answer +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 512 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 \ No newline at end of file diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml new file mode 100644 index 000000000..b7b4cb965 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/math_hard_0shot_cot.yaml @@ -0,0 +1,21 @@ +dataset_path: parquet +dataset_kwargs: + data_files: ./work_dir/joined_math_hard.parquet +task: meta_math_hard +process_docs: !function utils.process_docs +output_type: generate_until +test_split: train +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: answer +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 5120 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py 
b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py new file mode 100644 index 000000000..3b595ccfa --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/math_hard/utils.py @@ -0,0 +1,268 @@ +# Most of the code taken from https://github.com/EleutherAI/lm-evaluation-harness/blob/cddce0a148ec1710e2d60546c6f92727dd8a78fd/lm_eval/tasks/leaderboard/math/utils.py +import re +import signal +from typing import Dict, List, Optional + +import datasets + +from lm_eval.utils import eval_logger + + +try: + import sympy + from sympy.parsing.latex import parse_latex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sympy` is required for generating translation task prompt templates. \ +please install sympy via pip install lm-eval[math] or pip install -e .[math]", + ) + +# taken from +# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py +def doc_to_text(doc: dict) -> str: + return doc["input_final_prompts"][0] + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "answer": normalize_final_answer( + remove_boxed(last_boxed_only_string(doc["solution"])) + ), + "meta_target": doc["input_correct_responses"] + } + return out_doc + return dataset.map(_process_doc) + + +def process_results(doc: dict, results: List[str]) -> Dict[str, int]: + candidates = results[0] + last_boxed_string = last_boxed_only_string(candidates) + if not last_boxed_string: + # No boxed string found, so we can't evaluate + return {"exact_match": 0} + unnormalized_answer = remove_boxed(last_boxed_string) + answer = normalize_final_answer(unnormalized_answer) + + if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]): + retval = 1 + else: + retval = 0 + + results = { + "exact_match": retval, + } + return results + + +def last_boxed_only_string(string: str) -> Optional[str]: + idx = string.rfind("\\boxed") + if "\\boxed " in string: + return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def remove_boxed(s: str) -> str: + if "\\boxed " in s: + left = "\\boxed " + assert s[: len(left)] == left + return s[len(left) :] + + left = "\\boxed{" + + assert s[: len(left)] == left + assert s[-1] == "}" + + return s[len(left) : -1] + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + +def is_equiv(x1: str, x2: str) -> bool: + """ + x1 and x2 are normalized latex string + """ + try: + with timeout(seconds=5): + try: + parsed_x1 = parse_latex(x1) + parsed_x2 = parse_latex(x2) + except ( + sympy.parsing.latex.errors.LaTeXParsingError, + sympy.SympifyError, + TypeError, + ): + eval_logger.debug(f"couldn't 
parse one of {x1} or {x2}") + return False + + try: + diff = parsed_x1 - parsed_x2 + except TypeError: + eval_logger.debug(f"couldn't subtract {x1} and {x2}") + return False + + try: + if sympy.simplify(diff) == 0: + return True + else: + return False + except ValueError: + eval_logger.debug( + f"Had some trouble simplifying when comparing {x1} and {x2}" + ) + except TimeoutError: + eval_logger.debug(f"Timed out comparing {x1} and {x2}") + return False + except ImportError as e: + eval_logger.error(e) + raise + except Exception as e: + eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}") + return False + + +def get_unnormalized_answer(text: str) -> str: + INVALID_ANSWER = "[invalidanswer]" + end_seq = "I hope it is correct." + text += end_seq + match = re.search( + r"Final Answer: The final answer is(.*?). I hope it is correct.", + text, + ) + if match: + return match.group(1).strip() + else: + return INVALID_ANSWER + + +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def normalize_final_answer(final_answer: str) -> str: + """ + Normalize a final answer to a quantitative reasoning question. + + Copied character for character from appendix D of Lewkowycz et al. (2022) + """ + final_answer = final_answer.split("=")[-1] + + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. 
+ final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + + return final_answer diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml new file mode 100644 index 000000000..676696d97 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_instruct.yaml @@ -0,0 +1,4 @@ +group: meta_instruct +task: +- meta_mmlu_instruct +# - meta_mmlu_pro_instruct diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml new file mode 100644 index 000000000..4cb1c5f2c --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/meta_pretrain.yaml @@ -0,0 +1,4 @@ +group: meta_pretrain +task: +- meta_bbh +- meta_mmlu_pro_pretrain diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml new file mode 100644 index 000000000..1b52817fb --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/mmlu.yaml @@ -0,0 +1,29 @@ +task: meta_mmlu_instruct +dataset_path: meta-llama/Llama-3.3-70B-Instruct-evals +dataset_name: Llama-3.3-70B-Instruct-evals__mmlu__0_shot__cot__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: !function utils.doc_to_target +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 1024 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py new file mode 100644 index 000000000..9fdc3d277 --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py @@ -0,0 +1,52 @@ +import string + +import datasets + + +def doc_to_text(doc: dict) -> str: + question, choice = doc["input_question"], str(doc["input_choice_list"]) + prompt = [ + "You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. 
Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.", + "You are a skilled analyst and educator with expertise in critical thinking, analytical reasoning, and multiple-choice question strategy. Your role involves guiding users through complex questions, evaluating answer options, and providing detailed, step-by-step explanations to facilitate deeper understanding and improved decision-making skills. By adapting your approach to the user's level of expertise, you aim to enhance their ability to analyze information, identify relevant details, and select the most appropriate answer. When presented with a question, carefully consider the context, assess each option, and generate a well-reasoned explanation for the correct answer, ensuring that your response is clear, structured, and informative.", + "You are a knowledgeable and insightful expert designed to provide guidance on MCQ(s)! Your role involves analyzing Qs, assessing answer choices, and offering well-reasoned reasons to enhance understanding. By breaking down complex concepts using CoT thinking step by step, you can solve any problem in this dataset. Make sure at all costs to present information in a clear, structured manner while using user level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments!!", + "To provide a well-reasoned explanation for a multiple-choice question, analyze the question and assess the answer choices. Break down complex concepts into clear and structured information, adapting to the user's level of expertise. Present a step-by-step reasoning process to arrive at the correct answer, and clearly state the correct answer letter. Ensure the explanation is concise and easy to understand, fostering deeper comprehension and confidence in tackling multiple-choice assessments.", + ] + default_parsing_text = "Regardless of the approach, always conclude with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is one of A, B, C or D." + template = f"<|start_header_id|>user<|end_header_id|>{prompt[3]}. 
{default_parsing_text} Question: {question}\n {choice}\n<|eot_id|> \n\n<|start_header_id|>assistant<|end_header_id|>" + return template + + +# def doc_to_text(doc: dict) -> str: +# # Strip out the last two characters, which is a space and the answer +# # E.g., "Answer: B" -> "Answer:" +# return doc["input_final_prompts"][0][:-2] + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + + # dataset = dataset.select(range(1500, len(dataset))) + + dataset = dataset.select_columns( + [ + "input_question", + "input_correct_responses", + "input_final_prompts", + "is_correct", + "input_question_hash", + "input_choice_list", + "output_prediction_text", + ] + ) + dataset = dataset.rename_column("is_correct", "previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) + + +def doc_to_target(doc: dict) -> str: + return doc["gold"] diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml new file mode 100644 index 000000000..0c5e44d0a --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_instruct.yaml @@ -0,0 +1,29 @@ +task: meta_mmlu_pro_instruct +dataset_path: meta-llama/Llama-3.1-70B-Instruct-evals +dataset_name: Llama-3.1-70B-Instruct-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + group_select: -1 + regex_pattern: 'best answer is ([A-Z])' + - function: "take_first" +generation_kwargs: + until: [] + do_sample: false + temperature: 0 + max_gen_toks: 1024 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml new file mode 100644 index 000000000..fb1369a1e --- /dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/mmlu_pro_5shot_cot_pretrain.yaml @@ -0,0 +1,28 @@ +task: meta_mmlu_pro_pretrain +dataset_path: meta-llama/Llama-3.1-70B-evals +dataset_name: Llama-3.1-70B-evals__mmlu_pro__details +test_split: latest +output_type: generate_until +process_docs: !function utils.process_docs +doc_to_text: !function utils.doc_to_text +doc_to_target: gold +filter_list: + - name: "strict-match" + filter: + - function: "regex" + regex_pattern: 'answer is \(([A-Z])\)' + - function: "take_first" +generation_kwargs: + until: "\n\nQ: " + do_sample: false + temperature: 0 + max_gen_toks: 512 +num_fewshot: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true + ignore_case: true + ignore_punctuation: true +metadata: + version: 1.0 diff --git a/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py new file mode 100644 index 000000000..ab1889d59 --- 
/dev/null +++ b/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu_pro/utils.py @@ -0,0 +1,153 @@ +import string + +import datasets + + +def doc_to_text(doc: dict) -> str: + question, choice = doc["input_question"], str(doc["input_choice_list"]) + prompt = [ + "You are a helpful assistant for multiple choice questions.", + "To address the multiple-choice question effectively, carefully read and analyze the provided `question`. Then, evaluate each option in the `options` dictionary, considering the context and any relevant information that might influence the correct choice. In the `reasoning` field, outline your thought process step by step, ensuring that your logic is clear and easy to follow. Finally, select the most appropriate answer based on your analysis and present it as the `answer`. Remember to justify your choice with sound reasoning, making it easier for others to understand your decision-making process.", + ] + default_parsing_text = 'Your response should end with "The best answer is [the_answer_letter]." where the [the_answer_letter] is a letter from the provided choices.' + template = f"<|start_header_id|>user<|end_header_id|>{prompt[1]}. {default_parsing_text} Question: {question}\n {choice}\n<|eot_id|> \n\n<|start_header_id|>assistant<|end_header_id|>" + return template + + # def doc_to_text(doc: dict) -> str: + # input_question, input_choice = doc["input_question"], str(doc["input_choice_list"]) + + # optimized_examples = [ + # { + # "question": "Explain what difficulties would arise if messenger RNA molecules were not destroyed after they had produced some polypeptide chains.", + # "options": { + # "A": "mRNA would replicate rapidly", + # "B": "The cell would use mRNA as a source of energy", + # "C": "The cell would lack proteins", + # "D": "Cell would enter a state of permanent division", + # "E": "mRNA would be transformed into DNA", + # "F": "Excess protein production, energy depletion, and potential harm to the cell", + # "G": "mRNA would exit the cell and infect neighboring cells", + # "H": "Proteins would be broken down into mRNA", + # "I": "mRNA would become part of the cell membrane", + # "J": "mRNA would bind to lipids and carbohydrates, disrupting cellular metabolism", + # }, + # "answer": "F", + # }, + # { + # "question": "Based on the characteristic population curves that result from plotting population growth of a species, the most effective means of controlling the mosquito population is to", + # "options": { + # "A": "opt for zero population control once the K value of the curve has been reached", + # "B": "maintain the population at the highest point of its logistic curve", + # "C": "reduce the carrying capacity cif the environment to lower the K value", + # "D": "decrease the mortality rate", + # "E": "increase the birth rate of the species", + # "F": "drastically reduce the population below the K value", + # "G": "maintain the population at a point corresponding to the midpoint of its logistic curve", + # "H": "increase the carrying capacity of the environment to raise the K value", + # "I": "opt for zero population control at the beginning of the logistic curve", + # "J": "null", + # }, + # "answer": "C", + # }, + # { + # "question": "Solve the equation 1.2 = 0.4y using mental math.", + # "options": { + # "A": "3", + # "B": "\u22123", + # "C": "0.8", + # "D": "2", + # "E": "0.3", + # "F": "5", + # "G": "\u22124", + # "H": "4", + # "I": "6", + # "J": "1.6", + # }, + # "answer": "A", + # }, + # { + # "question": "assume you are 
Indonesian. In 2010, the rupiah exchange rate was around IDR15,000/USD, and the consumer price index in Indonesia and the United States was at 100. In 2019, the exchange rate changed to IDR14,000/USD. Simultaneously, Indonesia\u2019s inflation rose 5% due to the consumer price index rising to 105. Meanwhile, the United States\u2019 inflation rate rose 10% due to the consumer price index rising to 110. Whats the real exchange rate?", + # "options": { + # "A": "14000.00", + # "B": "15500.00", + # "C": "15000.00", + # "D": "16000.00", + # "E": "13500.00", + # "F": "14666.67", + # "G": "13888.89", + # "H": "14800.00", + # "I": "15200.00", + # "J": "13333.33", + # }, + # "answer": "F", + # }, + # { + # "question": "To move the economy closer to full employment the central bank decides that the federal funds rate must be increased. The appropriate open market operation is to ______ which ______ the money supply ______ aggregate demand and fights ______. OMO \u00a0\u00a0\u00a0 MONEY SUPPLY \u00a0\u00a0\u00a0 AD \u00a0\u00a0\u00a0 TO FIGHT", + # "options": { + # "A": "Buy bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment", + # "B": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment", + # "C": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment", + # "D": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Inflation", + # "E": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Unemployment", + # "F": "Sell bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Unemployment", + # "G": "Buy bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation", + # "H": "Sell bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Inflation", + # "I": "Buy bonds \u00a0\u00a0\u00a0 Decreases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation", + # "J": "Sell bonds \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Increases \u00a0\u00a0\u00a0 Inflation", + # }, + # "answer": "H", + # }, + # ] + + +# formatted_text = "" +# for question in optimized_examples: +# # Format user message with question and options +# user_msg = "<|start_header_id|>user<|end_header_id|>You are a helpful assistant designed to help with multiple choice question. Think step by step." +# user_msg += f"Question: {question['question']}\n" + +# # Add options +# for option_key, option_value in question["options"].items(): +# if option_value is not None: +# user_msg += f"{option_key}. {option_value}\n" + +# user_msg += '\nYour response should end with "The best answer is [the_answer_letter]." where the [the_answer_letter] is a letter from the provided choices.\n\n' +# user_msg += f"Let's think step by step. 
Question: {input_question}\n {input_choice}\n <|eot_id|>" + +# formatted_text += user_msg + +# # Add assistant placeholder message +# assistant_msg = "<|start_header_id|>assistant<|end_header_id|>\n\n<|eot_id|>" + +# formatted_text += assistant_msg + +# # print(formatted_text) +# # exit() + +# return formatted_text + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["input_question"], + "gold": doc["input_correct_responses"][0], + } + return out_doc + + dataset = dataset.select(range(1200, len(dataset))) + + dataset = dataset.select_columns( + [ + "input_question", + "input_correct_responses", + "input_final_prompts", + "is_correct", + "input_question_hash", + "input_choice_list", + "output_prediction_text", + ] + ) + dataset = dataset.rename_column("is_correct", "previously_is_correct") + dataset = dataset.map(_process_doc) + return dataset.map(_process_doc) diff --git a/end-to-end-use-cases/prompt-migration/.gitignore b/end-to-end-use-cases/prompt-migration/.gitignore new file mode 100644 index 000000000..b488f1ba7 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/.gitignore @@ -0,0 +1,3 @@ +/mmlu_pro_data + +*.csv diff --git a/end-to-end-use-cases/prompt-migration/README.md b/end-to-end-use-cases/prompt-migration/README.md new file mode 100644 index 000000000..e03bf28cb --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/README.md @@ -0,0 +1,105 @@ +# Prompt Migration + +## Overview + +The **Prompt Migration** toolkit helps you assess and adapt prompts across different language models, ensuring consistent performance and reliability. It includes benchmarking capabilities and evaluation tools to measure the effectiveness of prompt migrations. + +## Project Structure + +- `notebooks/`: Contains Jupyter notebooks for interactive prompt migration examples + - `harness.ipynb`: Main notebook demonstrating the prompt migration workflow +- `benchmarks/`: Tools and scripts for performance evaluation +- `environment.yml`: Conda environment specification with all required dependencies + +## Prerequisites + +1. **Conda Environment** + - [Miniconda](https://docs.conda.io/en/latest/miniconda.html) or [Anaconda](https://www.anaconda.com/) installed + - Python 3.10 + - Create and activate the environment: + ```bash + conda env create -f environment.yml + conda activate prompt-migration + ``` + +2. **Setting Up vLLM for Inference** + If you plan to use [vLLM](https://github.com/vllm-project/vllm) for model inference: + ```bash + pip install vllm + ``` + To serve a large model (example: Meta’s Llama 3.3 70B Instruct), you might run: + ```bash + vllm serve meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size=4 + ``` + Adjust the model name and `--tensor-parallel-size` according to your hardware and parallelization needs. + +3. **Accessing Hugging Face Datasets** + If you need to work with private or gated Hugging Face datasets, follow these steps: + 1. **Create a Hugging Face account (if you don’t have one):** + Visit [Hugging Face](https://huggingface.co/) and create an account. + 2. **Authenticate via the Hugging Face CLI:** + - Log in to Hugging Face: + ```bash + huggingface-cli login + ``` + - Enter your Hugging Face credentials (username and token). You can generate or retrieve your token in your [Hugging Face settings](https://huggingface.co/settings/tokens). + 3. **Check Dataset Permissions:** + Some datasets may require explicit permission from the dataset owner. 
If you continue to have access issues, visit the dataset page on Hugging Face to request or confirm your access rights. + +## Key Dependencies + +- **DSPy**: For prompt engineering and evaluation +- **LM-eval**: Evaluation framework for language models +- **PyTorch** and **Transformers**: For model inference + +## Getting Started + +1. **Activate your environment:** + ```bash + conda activate prompt-migration + ``` +2. **Start Jupyter notebook server:** + ```bash + jupyter notebook + ``` +3. **Open the main notebook:** + Navigate to `notebooks/harness.ipynb` in your browser to get started. + +4. **Configure MMLU Benchmark:** + In the notebook, modify the benchmark configuration to use MMLU: + ```python + from benchmarks import llama_mmlu # You can also choose other benchmarks available in `benchmarks/` + benchmark = llama_mmlu + ``` + +5. **Run Optimization:** + Choose an optimization level from the notebook and run the optimizer: + ```python + # Choose one: "light", "medium", or "heavy" + optimizer = dspy.MIPROv2(metric=benchmark.metric, auto="medium") + optimized_program = optimizer.compile(student, trainset=trainset) + + # View the optimized prompt and/or demos + print("BEST PROMPT:\n", optimized_program.signature.instructions) + print("BEST EXAMPLES:\n", optimized_program.predict.demos) + ``` + +6. **Run base and optimized prompts on meta-evals:** + Take the optimized prompt and examples and update your working directory: + - Navigate to `llama-recipes/end-to-end-use-cases/benchmarks/llm_eval_harness/meta_eval/work_dir/mmlu/utils.py` + - Open a new terminal and set up the meta-evals environment following the README in `/meta_eval` + - Update the `prompt` list with your base and optimized prompts as the first two items + ```python + prompt = ["base_prompt", "optimized_prompt"] # Your base prompt and optimized prompt + ``` + - Run lm_eval twice, once for the base prompt and once for the optimized prompt, by changing the `prompt` index in the template as follows: + ```python + template = f"<|start_header_id|>user<|end_header_id|>{prompt[0]}. Question: {question}\n {choice}\n<|eot_id|> \n\n<|start_header_id|>assistant" + ``` + +7. **Explore Benchmarks:** + Use the scripts in the `benchmarks/` directory to evaluate your prompt migrations; a minimal usage sketch follows this README. + +## License + +This project is part of the **Llama Recipes** collection. Please refer to the main repository’s license for usage terms.
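The following is a minimal sketch of how the `benchmarks/` modules added in this PR are meant to be driven from a notebook or script, assuming a locally served vLLM endpoint like the `vllm_llama_70b` entry in `notebooks/config.py`; the model name, port, and evaluation subset size are illustrative only:

```python
import dspy

from benchmarks import llama_mmlu

# Point DSPy at a locally served model (assumed endpoint; adjust to your config.py entry).
lm = dspy.LM(
    "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct",
    api_base="http://localhost:8001/v1",
    api_key=None,
)
dspy.configure(lm=lm)

# Build train/val/test splits and a predictor from the benchmark's signature.
trainset, valset, testset = llama_mmlu.datasets()
student = dspy.ChainOfThought(
    llama_mmlu.signature("Answer the multiple-choice question with a single letter.")
)

# Score the unoptimized program with the benchmark's metric before running MIPROv2.
evaluate = dspy.Evaluate(
    devset=list(valset)[:50],  # small slice for a quick sanity check
    metric=llama_mmlu.metric,
    display_progress=True,
)
print("Baseline accuracy:", evaluate(student))
```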
diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/__init__.py b/end-to-end-use-cases/prompt-migration/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/datatypes.py b/end-to-end-use-cases/prompt-migration/benchmarks/datatypes.py new file mode 100644 index 000000000..5347aeeaf --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/datatypes.py @@ -0,0 +1,10 @@ +import typing as t + +if t.TYPE_CHECKING: + import dspy + + +class TaskDatasets(t.NamedTuple): + trainset: t.Iterable["dspy.Example"] + valset: t.Iterable["dspy.Example"] + testset: t.Iterable["dspy.Example"] diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/download_mmlu_pro.py b/end-to-end-use-cases/prompt-migration/benchmarks/download_mmlu_pro.py new file mode 100644 index 000000000..449bbe3ef --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/download_mmlu_pro.py @@ -0,0 +1,30 @@ +import os + +import pandas as pd +from datasets import load_dataset + + +def download_mmlu_pro(): + # Create output directory if it doesn't exist + output_dir = "mmlu_pro_data" + os.makedirs(output_dir, exist_ok=True) + + # Load the dataset + dataset = load_dataset("TIGER-Lab/MMLU-Pro") + + # Convert each split to CSV + for split in dataset.keys(): + # Convert to pandas DataFrame + df = pd.DataFrame(dataset[split]) + + # Save to CSV + output_path = os.path.join(output_dir, f"mmlu_pro_{split}.csv") + df.to_csv(output_path, index=False) + print(f"Saved {split} split to {output_path}") + print(f"Number of examples in {split}: {len(df)}") + + +if __name__ == "__main__": + print("Downloading MMLU-Pro dataset...") + download_mmlu_pro() + print("Download complete!") diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/helpers.py b/end-to-end-use-cases/prompt-migration/benchmarks/helpers.py new file mode 100644 index 000000000..8215840cf --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/helpers.py @@ -0,0 +1,53 @@ +import typing as t + +from .datatypes import TaskDatasets + +if t.TYPE_CHECKING: + import dspy + from datasets import Dataset + + +def train_val_test_split( + dataset: "Dataset", + mapper: t.Callable[[dict], "dspy.Example"], + train_size: float = 0.1, + validation_size: float = 0.1, +) -> TaskDatasets: + docs = dataset.train_test_split(train_size=train_size) + train_docs = docs["train"] + docs = docs["test"].train_test_split(train_size=validation_size) + validation_docs = docs["train"] + test_docs = docs["test"] + return TaskDatasets( + trainset=list(map(mapper, train_docs)), + valset=list(map(mapper, validation_docs)), + testset=list(map(mapper, test_docs)), + ) + + +def fixed_split( + dataset: "Dataset", + mapper: t.Callable[[dict], "dspy.Example"], + train_size: int = 1000, + validation_size: int = 200, +) -> TaskDatasets: + """Split dataset by taking first N examples instead of random sampling. 
+ + Args: + dataset: Input dataset + mapper: Function to map dataset examples to dspy.Example + train_size: Number of examples to use for training (default: 1000) + validation_size: Number of examples to use for validation (default: 200) + + Returns: + TaskDatasets containing train, validation and test splits + """ + train_docs = dataset.select(range(train_size)) + validation_docs = dataset.select(range(train_size, train_size + validation_size)) + test_docs = dataset.select(range(train_size + validation_size, len(dataset))) + + return TaskDatasets( + trainset=list(map(mapper, train_docs)), + valset=list(map(mapper, validation_docs)), + testset=list(map(mapper, test_docs)), + ) diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/humaneval.py b/end-to-end-use-cases/prompt-migration/benchmarks/humaneval.py new file mode 100644 index 000000000..754c1389a --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/humaneval.py @@ -0,0 +1,73 @@ +import typing as t + +from bigcode_eval.tasks import humaneval +from bigcode_eval.tasks.custom_metrics.execute import check_correctness +from datasets import load_dataset +from lm_eval.evaluator_utils import eval_logger +import dspy + +from .datatypes import TaskDatasets +from .helpers import train_val_test_split + +if t.TYPE_CHECKING: + from bigcode_eval.base import Task + + +def signature(instructions: str = "") -> dspy.Signature: + class HumanEval(dspy.Signature): + __doc__ = instructions + prompt: str = dspy.InputField() + solution: str = dspy.OutputField() + + return HumanEval + + +def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool: + program = gold.prompt + "\n" + pred.solution + "\n" + gold.dspy_test + result = check_correctness( + program, + timeout=30, + task_id=gold.dspy_task_id, + completion_id=None, + ) + + if result["passed"]: + return True + + eval_logger.debug(f"{gold.dspy_task_id}: {result['result']}") + return False + + +def datasets( + train_size: float = 0.1, + validation_size: float = 0.2, +) -> TaskDatasets: + dataset = load_dataset("codeparrot/instructhumaneval") + # instructhumaneval ships a single "test" split; split and map it with the shared helper. + return train_val_test_split( + dataset["test"], + _task_doc_example, + train_size=train_size, + validation_size=validation_size, + ) + + +class TaskDoc(t.TypedDict): + task_id: str + prompt: str + canonical_solution: str + test: str + + +inputs = ["prompt"] +outputs = ["solution"] + + +def _task_doc_example(doc: TaskDoc) -> dspy.Example: + return dspy.Example( + prompt=doc["prompt"], + solution=doc["canonical_solution"], + # dspy_ keys are hidden + dspy_task_id=doc["task_id"], + dspy_test=doc["test"], + ).with_inputs(*inputs) \ No newline at end of file diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/leaderboard_mmlu_pro.py b/end-to-end-use-cases/prompt-migration/benchmarks/leaderboard_mmlu_pro.py new file mode 100644 index 000000000..2329835f4 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/leaderboard_mmlu_pro.py @@ -0,0 +1,61 @@ +import typing as t + +from datasets import load_dataset +import dspy + +from .datatypes import TaskDatasets +from .helpers import train_val_test_split + + +def signature(instructions: str = "") -> dspy.Signature: + class MMLUPro(dspy.Signature): + __doc__ = instructions + question: str = dspy.InputField() + options: list[str] = dspy.InputField() + answer: str = dspy.OutputField() + + return
MMLUPro + + +def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool: + return gold.answer == pred.answer + + +def datasets( + train_size: float = 0.1, + validation_size: float = 0.2, +) -> TaskDatasets: + dataset = load_dataset("TIGER-Lab/MMLU-Pro") + return train_val_test_split( + dataset["test"], _task_doc_example, train_size, validation_size + ) + + +class TaskDoc(t.TypedDict): + question_id: int + question: str + options: list[str] + answer: str + answer_index: int + cot_content: str + category: str + src: str + + +inputs = ["question", "options"] +outputs = ["answer"] + + +def _num_letter(n: int) -> str: + return chr(ord("A") + n) + + +def _task_doc_example(doc: TaskDoc) -> dspy.Example: + question = doc["question"] + options = [f"{_num_letter(i)}. {option}" for i, option in enumerate(doc["options"])] + answer = doc["answer"] + return dspy.Example( + question=question, + options=options, + answer=answer, + ).with_inputs(*inputs) diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu.py b/end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu.py new file mode 100644 index 000000000..0b621be65 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu.py @@ -0,0 +1,76 @@ +import typing as t + +import dspy + +from datasets import load_dataset + +from .datatypes import TaskDatasets +from .helpers import fixed_split, train_val_test_split + + +def datasets( + train_size: float = 0.1, + validation_size: float = 0.1, +) -> TaskDatasets: + """ + Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want) + """ + dataset = load_dataset( + "meta-llama/Llama-3.3-70B-Instruct-evals", + "Llama-3.3-70B-Instruct-evals__mmlu__0_shot__cot__details", + ) + return train_val_test_split(dataset["latest"], _task_doc_example) + + +class TaskDoc(t.TypedDict): + task_type: str + task_name: str + subtask_name: str + input_question: str + input_choice_list: dict + input_final_prompts: list + input_correct_responses: list + output_prediction_text: list + output_parsed_answer: str + output_choice_completions: t.Optional[int] + output_choice_negative_log_likelihoods: t.Optional[int] + output_metrics: dict + is_correct: bool + input_question_hash: str + input_final_prompts_hash: list + benchmark_label: str + eval_config: dict + + +def _task_doc_example(doc: TaskDoc) -> dspy.Example: + # Get reasoning from output_prediction_text if available + # reasoning = ( + # doc["output_prediction_text"][0] if doc.get("output_prediction_text") else "" + # ) + + example = dspy.Example( + question=doc["input_question"], + options=doc["input_choice_list"], + answer=doc["output_parsed_answer"], + # reasoning=reasoning, + ) + example._input_keys = {"question", "options"} + example._output_keys = {"answer"} + return example + + +def signature(instructions: str = "") -> dspy.Signature: + class MMLU(dspy.Signature): + __doc__ = instructions + question: str = dspy.InputField(desc="The question to be answered") + options: dict = dspy.InputField(desc="Dictionary of answer choices") + # reasoning: str = dspy.OutputField( + # desc="Step-by-step reasoning for arriving at the answer" + # ) + answer: str = dspy.OutputField(desc="The correct answer letter") + + return MMLU + + +def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool: + return gold.answer == pred.answer # Keep focusing on answer accuracy diff --git a/end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py 
b/end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py new file mode 100644 index 000000000..4e4f0aeb4 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/benchmarks/llama_mmlu_pro.py @@ -0,0 +1,71 @@ +import typing as t + +import dspy + +from datasets import load_dataset + +from .datatypes import TaskDatasets +from .helpers import fixed_split, train_val_test_split + + +def datasets( + train_size: float = 0.1, + validation_size: float = 0.2, +) -> TaskDatasets: + """ + Load dataset, dataset should be datasets.Dataset type (NOT DatasetDict, OR split the dataset yourself how you want) + """ + dataset = load_dataset( + "meta-llama/Llama-3.3-70B-Instruct-evals", + "Llama-3.3-70B-Instruct-evals__mmlu_pro__details", + ) + return fixed_split(dataset["latest"], _task_doc_example) + + +class TaskDoc(t.TypedDict): + task_type: str + task_name: str + subtask_name: str + input_question: str + input_choice_list: dict + input_final_prompts: list + input_correct_responses: list + output_prediction_text: list + output_parsed_answer: str + output_choice_completions: t.Optional[dict] + output_choice_negative_log_likelihoods: t.Optional[dict] + output_metrics: dict + is_correct: bool + input_question_hash: str + input_final_prompts_hash: list + benchmark_label: str + eval_config: dict + + +inputs = ["input_question", "input_choice_list"] +outputs = ["output_parsed_answer"] + + +def _task_doc_example(doc: TaskDoc) -> dspy.Example: + example = dspy.Example( + question=doc["input_question"], + options=doc["input_choice_list"], + answer=doc["output_parsed_answer"], + ) + example._input_keys = {"question", "options"} + example._output_keys = {"answer"} + return example + + +def signature(instructions: str = "") -> dspy.Signature: + class MMLUPro(dspy.Signature): + __doc__ = instructions + question: str = dspy.InputField(desc="The question to be answered") + options: dict = dspy.InputField(desc="Dictionary of answer choices") + answer: str = dspy.OutputField(desc="The correct answer letter") + + return MMLUPro + + +def metric(gold: dspy.Example, pred: dspy.Example, trace=False) -> bool: + return gold.answer == pred.answer diff --git a/end-to-end-use-cases/prompt-migration/environment.yml b/end-to-end-use-cases/prompt-migration/environment.yml new file mode 100644 index 000000000..ed97cd494 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/environment.yml @@ -0,0 +1,26 @@ +name: prompt-migration +channels: + - defaults + - pytorch + - conda-forge +dependencies: + - python=3.10 # Updated to match pyproject.toml requires-python + - pip + - jupyter + - notebook + - numpy<2 # Matches pyproject.toml dependency + - pip: + - lm-eval[wandb,api,math,ifeval,sentencepiece]>=0.4.7 + - python-dotenv>=1.0.1 + - ipdb>=0.13.13 + - ipython>=8.31.0 + - pytest>=8.3.4 + - ruff>=0.9.1 + - ipykernel>=6.29.5 + - torch + - transformers + - openai + - databricks-sdk + - python-dotenv + - dspy + - huggingface_hub diff --git a/end-to-end-use-cases/prompt-migration/llama3_405b_chat_template.jinja b/end-to-end-use-cases/prompt-migration/llama3_405b_chat_template.jinja new file mode 100644 index 000000000..4d8334d44 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/llama3_405b_chat_template.jinja @@ -0,0 +1,107 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} + 
+{# Extract system message if available #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "" %} +{%- endif %} + +{# System message + tool setup #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} + +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{# Handle custom tools in user messages #} +{%- if tools_in_user_message and not tools is none %} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{# Process messages #} +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} + +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/end-to-end-use-cases/prompt-migration/notebooks/config.py b/end-to-end-use-cases/prompt-migration/notebooks/config.py new file mode 100644 index 000000000..d8c520bbb --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/notebooks/config.py @@ -0,0 +1,59 @@ +import os + +MODEL_CONFIGS = { + "vllm_llama_70b": { + "model": "hosted_vllm/meta-llama/Llama-3.3-70B-Instruct", + "api_base": "http://localhost:8001/v1", + "api_key": None, + "port": 8001, + "cuda_devices": "4,5,6,7", + "tensor_parallel": 4, + "gpu_util": 0.90, + "chat_template": None, + }, + "vllm_llama_90b": { + "model": "hosted_vllm/meta-llama/Llama-3.2-90B-Vision-Instruct", + "api_base": "http://localhost:8090/v1", + "api_key": None, + "port": 8090, + "cuda_devices": "4,5,6,7", + "tensor_parallel": 4, + "gpu_util": 0.70, + "chat_template": None, + }, + "vllm_llama_405b": { + "model": "hosted_vllm/meta-llama/Llama-3.1-405B-FP8", + "api_base": "http://localhost:8405/v1", + "api_key": None, + "port": 8405, + "cuda_devices": "0,1,2,3,4,5,6,7", + "tensor_parallel": 8, + "gpu_util": 0.80, + "chat_template": "./llama3_405b_chat_template.jinja", + }, + "vllm_llama_8b": { + "model": "hosted_vllm/meta-llama/Llama-3.1-8B-Instruct", + "api_base": "http://localhost:8008/v1", + "api_key": None, + "port": 8008, + "cuda_devices": "0", + "tensor_parallel": 1, + "gpu_util": 0.95, + "chat_template": None, + }, + "openrouter_gpt4o": { + "model": "openrouter/openai/gpt-4o", + "api_base": "https://openrouter.ai/api/v1", + "api_key": os.getenv("OPENROUTER_API_KEY"), + }, + "openrouter_gpt4o_mini": { + "model": "openrouter/openai/gpt-4o-mini", + "api_base": "https://openrouter.ai/api/v1", + "api_key": os.getenv("OPENROUTER_API_KEY"), + 
}, + "openrouter_llama_70b": { + "model": "openrouter/meta-llama/llama-3.3-70b-instruct", + "api_base": "https://openrouter.ai/api/v1", + "api_key": os.getenv("OPENROUTER_API_KEY"), + }, +} diff --git a/end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb b/end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb new file mode 100644 index 000000000..8e09c431b --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/notebooks/harness.ipynb @@ -0,0 +1,3580 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports & Env Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "import sys\n", + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "from datasets import load_dataset\n", + "\n", + "\n", + "import dspy\n", + "sys.path.append(os.path.abspath('../'))\n", + "from benchmarks import llama_mmlu_pro, leaderboard_mmlu_pro, llama_mmlu" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configuration" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_THREADS = 48\n", + "\n", + "FEW_SHOTS = 5\n", + "\n", + "# See https://docs.litellm.ai/docs/providers/vllm for details\n", + "TASK_MODEL = dspy.LM(\n", + " \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\",\n", + " api_base = 'http://localhost:8000/v1' , # or api_base ?\n", + " api_key = \"dummy\",\n", + " # api_version: Optional[str] = None,\n", + " # api_key: Optional[str] = None,\n", + " # seed: Optional[int] = None,\n", + " # max_tokens: Optional[int] = None,\n", + " # timeout: Optional[Union[float, int]] = None,\n", + ")\n", + "PROMPT_MODEL = dspy.LM(\n", + " \"hosted_vllm/meta-llama/Llama-3.3-70B-Instruct\",\n", + " api_base = 'http://localhost:8000/v1', # or api_base ?\n", + " api_key = \"dummy\",\n", + "\n", + " # api_version: Optional[str] = None,\n", + " # api_key: Optional[str] = None,\n", + " # seed: Optional[int] = None,\n", + " # max_tokens: Optional[int] = None,\n", + " # timeout: Optional[Union[float, int]] = None,\n", + ")\n", + "\n", + "dspy.configure(lm=TASK_MODEL)\n", + "\n", + "# replace this with llama_mmlu_pro or whatever\n", + "benchmark = llama_mmlu\n", + "\n", + "# Without chain of thought:\n", + "# program = dspy.Predict(\n", + "# benchmark.signature(\"\")\n", + "# )\n", + "\n", + "# With chain of thought:\n", + "program = dspy.ChainOfThought(\n", + " benchmark.signature(\"You are a helpful assistant.\") # put your initial system prompt here, or leave blank\n", + ")\n", + "\n", + "evaluate = dspy.Evaluate(\n", + " devset=[],\n", + " max_errors = 500,\n", + " metric=benchmark.metric,\n", + " num_threads=NUM_THREADS,\n", + " display_progress=True,\n", + " display_table=True,\n", + " return_all_scores=True,\n", + " return_outputs=True,\n", + " provide_traceback=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1403, 1263, 11369)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "trainset, valset, testset = benchmark.datasets(\n", + " train_size=0.1,\n", + " validation_size=0.1,\n", + ")\n", + "\n", + "len(trainset), len(valset), len(testset)" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "### Baseline Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BASE PROMPT:\n", + " You are a helpful assistant.\n", + "CPU times: user 270 μs, sys: 7 μs, total: 277 μs\n", + "Wall time: 231 μs\n" + ] + } + ], + "source": [ + "%%time\n", + "print(\"BASE PROMPT:\\n\", program.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST EXAMPLES:\n", + " []\n", + "CPU times: user 107 μs, sys: 0 ns, total: 107 μs\n", + "Wall time: 110 μs\n" + ] + } + ], + "source": [ + "%%time\n", + "print(\"BEST EXAMPLES:\\n\", program.demos)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Starting execution...\n", + "Average Metric: 35.00 / 40 (87.5%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:21<00:00, 1.84it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/02/03 10:43:12 INFO dspy.evaluate.evaluate: Average Metric: 35 / 40 (87.5%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionoptionsexample_answerreasoningpred_answermetric
0The first hominids could be described as:{'A': 'quadrupedal knappers.', 'B': 'quadrupedal pongids.', 'C': '...CThe first hominids are known for their transition from a quadruped...C✔️ [True]
1Investment demand most likely increases when{'A': 'real GDP decreases.', 'B': 'the cost of acquiring and maint...CInvestment demand is influenced by several factors, including expe...C✔️ [True]
2Which of the following is a legitimate threat to societal security?{'A': 'Political policies designed to alter demographic characteri...BTo determine a legitimate threat to societal security, we must con...B✔️ [True]
3Long-run aggregate supply is most likely to increase as the result of{'A': 'an increase in the real interest rate', 'B': 'increased inv...BThe long-run aggregate supply (LRAS) curve represents the total am...B✔️ [True]
4As of 2013, share of people in the India who think political parti...{'A': '26%', 'B': '46%', 'C': '66%', 'D': '86%'}DTo answer this question, we need to consider the available data an...D✔️ [True]
5This question refers to the following information. \"I travelled th...{'A': 'The ability of commerce to foster cultural diffusion', 'B':...AThe journal entry by Ibn Battuta describes the city of 'Aden as a ...A✔️ [True]
6A woman is engaged in the retail sale of widgets throughout the Un...{'A': \"No, because the woman's order was too indefinite to constit...DThe woman's purchase order constituted an offer to the manufacture...D✔️ [True]
7For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...DIn Scenario 1, the main character kills spiders because their frie...D✔️ [True]
8A 67-year-old woman has had fatigue, dry skin, brittle hair, swell...{'A': 'Chronic lymphocytic thyroiditis (Hashimoto disease)', 'B': ...AThe patient's symptoms, such as fatigue, dry skin, brittle hair, s...A✔️ [True]
9Weber said that the 'spirit of capitalism' could be traced back to:{'A': 'the movement towards religious pluralism', 'B': 'inspiratio...DTo answer this question, we need to consider the ideas of Max Webe...D✔️ [True]
10This question refers to the following information. The history of ...{'A': 'The Oxford Movement', 'B': 'Materialism and economic determ...BThe Communist Manifesto, written by Karl Marx and Friedrich Engels...B✔️ [True]
11A nongovernmental, not-for-profit organization held the following ...{'A': '$12,700', 'B': '$13,000', 'C': '$13,800', 'D': '$14,900'}DTo determine the amount of stock investments that should be report...D✔️ [True]
12If you were hired by a large company to develop a new training pro...{'A': 'needs analysis.', 'B': 'job evaluation.', 'C': 'summative e...ATo develop an effective training program, it's crucial to understa...A✔️ [True]
13How were the first metals worked in South America?{'A': 'casting', 'B': 'hammering', 'C': 'smelting', 'D': 'all of t...DThe first metals worked in South America involved various techniqu...D✔️ [True]
14Under the Articles of Confederation, the national government had t...{'A': 'negotiate treaties', 'B': 'collect taxes', 'C': 'establish ...AThe Articles of Confederation, which served as the first constitut...A✔️ [True]
15A large company has offices in two locations, one in New Jersey an...{'A': '$22,500 ', 'B': '$23,700 ', 'C': '$25,500 ', 'D': '$27,300 '}DTo find the mean salary paid to the office assistants in the compa...D✔️ [True]
16Which character on the TV show 'Friends' is a chef?{'A': 'Joey', 'B': 'Monica', 'C': 'Ross', 'D': 'Rachel'}BTo answer this question, we need to consider the main characters o...B✔️ [True]
17Which of the following events INITIATES puberty?{'A': 'pituitary gland releases FSH', 'B': 'pituitary gland releas...CThe initiation of puberty is a complex process involving the hypot...C✔️ [True]
18Which of the boys on the TV show 'My Three Sons' is adopted?{'A': 'Mike', 'B': 'Ernie', 'C': 'Chip', 'D': 'Robbie'}BThe TV show 'My Three Sons' features a family with three boys. The...B✔️ [True]
19What is the minimum value of the expression x + 4z as a function d...{'A': '0', 'B': '-2', 'C': '-sqrt(34)', 'D': '-sqrt(35)'}CTo find the minimum value of the expression x + 4z subject to the ...C✔️ [True]
20What will happen to the equilibrium price and the equilibrium quan...{'A': 'The equilibrium price will rise and the equilibrium quantit...AWhen producers of good A expect the price to be higher in the near...C
21Construct a complete truth table for the following argument. Then,...{'A': 'Valid', 'B': 'Invalid. Counterexample when M and O are true...ATo determine the validity of the argument, we first need to constr...B
22This question refers to the following information. \"If any person ...{'A': 'rural and urban interests.', 'B': 'federal law and state la...BThe passage describes a Pennsylvania law from 1826 that criminaliz...B✔️ [True]
23Which of the following is not an element of the marketing mix?{'A': 'Promotion.', 'B': 'Product.', 'C': 'Target market.', 'D': '...CThe marketing mix, also known as the 4 Ps, consists of Product, Pr...C✔️ [True]
24Which of the following must be done when universal screening data ...{'A': 'Changes must be made in the delivery of the core program.',...AWhen universal screening data indicate that very few students are ...A✔️ [True]
25A large man with red hair robbed a liquor store. Thereafter, a def...{'A': 'admissible as a prior identification.', 'B': \"admissible, f...BThe corrections officer's testimony is being offered to prove that...B✔️ [True]
26Good X is exchanged in a competitive market. Which of the followin...{'A': 'If the demand curve is perfectly elastic, the price rises b...DWhen an excise tax is imposed on the production of a good in a com...D✔️ [True]
27Of the following compounds, which is LEAST likely to behave as a L...{'A': 'BeCl2', 'B': 'MgCl2', 'C': 'ZnCl2', 'D': 'SCl2'}DTo determine which of the given compounds is least likely to behav...B
28Mr. Cleary’s class and Ms. Ntuala’s class go to use the computer l...{'A': '2', 'B': '6', 'C': '10', 'D': '14'}CTo find the maximum number of students who can have a computer to ...{C}
29As of December 1, year 2, a company obtained a $1,000,000 line of ...{'A': 'Current liabilities of $1,000,000; long-term liabilities of...CTo determine the presentation of the company's debt in its classif...C✔️ [True]
30Use indirect truth tables to determine whether the following argum...{'A': 'Valid', 'B': 'Invalid. Counterexample when P, Q, R, and S a...ATo determine the validity of the argument using indirect truth tab...C
31Which expression represents the phrase below? 3 fewer than a numbe...{'A': '3-p', 'B': 'p+3', 'C': '3/p', 'D': 'p-3'}DTo represent the phrase \"3 fewer than a number, p\", we need to und...D✔️ [True]
32The influenza virus is mainly controlled in special \"risk\" sectors...{'A': 'Hygiene', 'B': 'Vaccination', 'C': 'Antiviral drugs', 'D': ...BThe influenza virus can be controlled through various methods, but...B✔️ [True]
33What size of cannula would you use in a patient who needed a rapid...{'A': '18 gauge.', 'B': '20 gauge.', 'C': '22 gauge.', 'D': '24 ga...ATo determine the correct size of cannula for a rapid blood transfu...A✔️ [True]
34Which one of the following is not a characteristic of a team?{'A': 'Minimal and formal knowledge sharing', 'B': 'Collective out...ATo determine which one of the following is not a characteristic of...A✔️ [True]
35When developing a plan of care relating to the management of a per...{'A': 'physical and pharmacological needs only.', 'B': 'physical a...CTo develop an effective plan of care for managing a person's pain,...C✔️ [True]
36Which of the following teenagers have the highest pregnancy rates?{'A': 'U.S.', 'B': 'Canadian', 'C': 'Swedish', 'D': 'French'}ATo answer this question, we need to consider the available data on...A✔️ [True]
37Based on the analysis of oxygen isotope ratios in the shells of ga...{'A': 'It became increasingly complex due to increased rainfall an...BThe analysis of oxygen isotope ratios in the shells of gastropods ...B✔️ [True]
38Which is a 'rock' under the UN Convention of the Law of the Sea (U...{'A': 'Rocks are the islands that are less than 10 square miles', ...CTo answer this question, we need to understand the definition of a...C✔️ [True]
39The Gravitron is a carnival ride that looks like a large cylinder....{'A': 'μv^2/(rg)', 'B': 'r^2v^2/(μg)', 'C': 'rg/(μv^2)', 'D': 'Non...ATo solve this problem, we need to consider the forces acting on a ...A✔️ [True]
\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 The first hominids could be described as: \n", + "1 Investment demand most likely increases when \n", + "2 Which of the following is a legitimate threat to societal security? \n", + "3 Long-run aggregate supply is most likely to increase as the result of \n", + "4 As of 2013, share of people in the India who think political parti... \n", + "5 This question refers to the following information. \"I travelled th... \n", + "6 A woman is engaged in the retail sale of widgets throughout the Un... \n", + "7 For which of these two scenarios does the main character (who uses... \n", + "8 A 67-year-old woman has had fatigue, dry skin, brittle hair, swell... \n", + "9 Weber said that the 'spirit of capitalism' could be traced back to: \n", + "10 This question refers to the following information. The history of ... \n", + "11 A nongovernmental, not-for-profit organization held the following ... \n", + "12 If you were hired by a large company to develop a new training pro... \n", + "13 How were the first metals worked in South America? \n", + "14 Under the Articles of Confederation, the national government had t... \n", + "15 A large company has offices in two locations, one in New Jersey an... \n", + "16 Which character on the TV show 'Friends' is a chef? \n", + "17 Which of the following events INITIATES puberty? \n", + "18 Which of the boys on the TV show 'My Three Sons' is adopted? \n", + "19 What is the minimum value of the expression x + 4z as a function d... \n", + "20 What will happen to the equilibrium price and the equilibrium quan... \n", + "21 Construct a complete truth table for the following argument. Then,... \n", + "22 This question refers to the following information. \"If any person ... \n", + "23 Which of the following is not an element of the marketing mix? \n", + "24 Which of the following must be done when universal screening data ... \n", + "25 A large man with red hair robbed a liquor store. Thereafter, a def... \n", + "26 Good X is exchanged in a competitive market. Which of the followin... \n", + "27 Of the following compounds, which is LEAST likely to behave as a L... \n", + "28 Mr. Cleary’s class and Ms. Ntuala’s class go to use the computer l... \n", + "29 As of December 1, year 2, a company obtained a $1,000,000 line of ... \n", + "30 Use indirect truth tables to determine whether the following argum... \n", + "31 Which expression represents the phrase below? 3 fewer than a numbe... \n", + "32 The influenza virus is mainly controlled in special \"risk\" sectors... \n", + "33 What size of cannula would you use in a patient who needed a rapid... \n", + "34 Which one of the following is not a characteristic of a team? \n", + "35 When developing a plan of care relating to the management of a per... \n", + "36 Which of the following teenagers have the highest pregnancy rates? \n", + "37 Based on the analysis of oxygen isotope ratios in the shells of ga... \n", + "38 Which is a 'rock' under the UN Convention of the Law of the Sea (U... \n", + "39 The Gravitron is a carnival ride that looks like a large cylinder.... \n", + "\n", + " options \\\n", + "0 {'A': 'quadrupedal knappers.', 'B': 'quadrupedal pongids.', 'C': '... \n", + "1 {'A': 'real GDP decreases.', 'B': 'the cost of acquiring and maint... \n", + "2 {'A': 'Political policies designed to alter demographic characteri... \n", + "3 {'A': 'an increase in the real interest rate', 'B': 'increased inv... 
\n", + "4 {'A': '26%', 'B': '46%', 'C': '66%', 'D': '86%'} \n", + "5 {'A': 'The ability of commerce to foster cultural diffusion', 'B':... \n", + "6 {'A': \"No, because the woman's order was too indefinite to constit... \n", + "7 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", + "8 {'A': 'Chronic lymphocytic thyroiditis (Hashimoto disease)', 'B': ... \n", + "9 {'A': 'the movement towards religious pluralism', 'B': 'inspiratio... \n", + "10 {'A': 'The Oxford Movement', 'B': 'Materialism and economic determ... \n", + "11 {'A': '$12,700', 'B': '$13,000', 'C': '$13,800', 'D': '$14,900'} \n", + "12 {'A': 'needs analysis.', 'B': 'job evaluation.', 'C': 'summative e... \n", + "13 {'A': 'casting', 'B': 'hammering', 'C': 'smelting', 'D': 'all of t... \n", + "14 {'A': 'negotiate treaties', 'B': 'collect taxes', 'C': 'establish ... \n", + "15 {'A': '$22,500 ', 'B': '$23,700 ', 'C': '$25,500 ', 'D': '$27,300 '} \n", + "16 {'A': 'Joey', 'B': 'Monica', 'C': 'Ross', 'D': 'Rachel'} \n", + "17 {'A': 'pituitary gland releases FSH', 'B': 'pituitary gland releas... \n", + "18 {'A': 'Mike', 'B': 'Ernie', 'C': 'Chip', 'D': 'Robbie'} \n", + "19 {'A': '0', 'B': '-2', 'C': '-sqrt(34)', 'D': '-sqrt(35)'} \n", + "20 {'A': 'The equilibrium price will rise and the equilibrium quantit... \n", + "21 {'A': 'Valid', 'B': 'Invalid. Counterexample when M and O are true... \n", + "22 {'A': 'rural and urban interests.', 'B': 'federal law and state la... \n", + "23 {'A': 'Promotion.', 'B': 'Product.', 'C': 'Target market.', 'D': '... \n", + "24 {'A': 'Changes must be made in the delivery of the core program.',... \n", + "25 {'A': 'admissible as a prior identification.', 'B': \"admissible, f... \n", + "26 {'A': 'If the demand curve is perfectly elastic, the price rises b... \n", + "27 {'A': 'BeCl2', 'B': 'MgCl2', 'C': 'ZnCl2', 'D': 'SCl2'} \n", + "28 {'A': '2', 'B': '6', 'C': '10', 'D': '14'} \n", + "29 {'A': 'Current liabilities of $1,000,000; long-term liabilities of... \n", + "30 {'A': 'Valid', 'B': 'Invalid. Counterexample when P, Q, R, and S a... \n", + "31 {'A': '3-p', 'B': 'p+3', 'C': '3/p', 'D': 'p-3'} \n", + "32 {'A': 'Hygiene', 'B': 'Vaccination', 'C': 'Antiviral drugs', 'D': ... \n", + "33 {'A': '18 gauge.', 'B': '20 gauge.', 'C': '22 gauge.', 'D': '24 ga... \n", + "34 {'A': 'Minimal and formal knowledge sharing', 'B': 'Collective out... \n", + "35 {'A': 'physical and pharmacological needs only.', 'B': 'physical a... \n", + "36 {'A': 'U.S.', 'B': 'Canadian', 'C': 'Swedish', 'D': 'French'} \n", + "37 {'A': 'It became increasingly complex due to increased rainfall an... \n", + "38 {'A': 'Rocks are the islands that are less than 10 square miles', ... \n", + "39 {'A': 'μv^2/(rg)', 'B': 'r^2v^2/(μg)', 'C': 'rg/(μv^2)', 'D': 'Non... \n", + "\n", + " example_answer \\\n", + "0 C \n", + "1 C \n", + "2 B \n", + "3 B \n", + "4 D \n", + "5 A \n", + "6 D \n", + "7 D \n", + "8 A \n", + "9 D \n", + "10 B \n", + "11 D \n", + "12 A \n", + "13 D \n", + "14 A \n", + "15 D \n", + "16 B \n", + "17 C \n", + "18 B \n", + "19 C \n", + "20 A \n", + "21 A \n", + "22 B \n", + "23 C \n", + "24 A \n", + "25 B \n", + "26 D \n", + "27 D \n", + "28 C \n", + "29 C \n", + "30 A \n", + "31 D \n", + "32 B \n", + "33 A \n", + "34 A \n", + "35 C \n", + "36 A \n", + "37 B \n", + "38 C \n", + "39 A \n", + "\n", + " reasoning \\\n", + "0 The first hominids are known for their transition from a quadruped... \n", + "1 Investment demand is influenced by several factors, including expe... 
\n", + "2 To determine a legitimate threat to societal security, we must con... \n", + "3 The long-run aggregate supply (LRAS) curve represents the total am... \n", + "4 To answer this question, we need to consider the available data an... \n", + "5 The journal entry by Ibn Battuta describes the city of 'Aden as a ... \n", + "6 The woman's purchase order constituted an offer to the manufacture... \n", + "7 In Scenario 1, the main character kills spiders because their frie... \n", + "8 The patient's symptoms, such as fatigue, dry skin, brittle hair, s... \n", + "9 To answer this question, we need to consider the ideas of Max Webe... \n", + "10 The Communist Manifesto, written by Karl Marx and Friedrich Engels... \n", + "11 To determine the amount of stock investments that should be report... \n", + "12 To develop an effective training program, it's crucial to understa... \n", + "13 The first metals worked in South America involved various techniqu... \n", + "14 The Articles of Confederation, which served as the first constitut... \n", + "15 To find the mean salary paid to the office assistants in the compa... \n", + "16 To answer this question, we need to consider the main characters o... \n", + "17 The initiation of puberty is a complex process involving the hypot... \n", + "18 The TV show 'My Three Sons' features a family with three boys. The... \n", + "19 To find the minimum value of the expression x + 4z subject to the ... \n", + "20 When producers of good A expect the price to be higher in the near... \n", + "21 To determine the validity of the argument, we first need to constr... \n", + "22 The passage describes a Pennsylvania law from 1826 that criminaliz... \n", + "23 The marketing mix, also known as the 4 Ps, consists of Product, Pr... \n", + "24 When universal screening data indicate that very few students are ... \n", + "25 The corrections officer's testimony is being offered to prove that... \n", + "26 When an excise tax is imposed on the production of a good in a com... \n", + "27 To determine which of the given compounds is least likely to behav... \n", + "28 To find the maximum number of students who can have a computer to ... \n", + "29 To determine the presentation of the company's debt in its classif... \n", + "30 To determine the validity of the argument using indirect truth tab... \n", + "31 To represent the phrase \"3 fewer than a number, p\", we need to und... \n", + "32 The influenza virus can be controlled through various methods, but... \n", + "33 To determine the correct size of cannula for a rapid blood transfu... \n", + "34 To determine which one of the following is not a characteristic of... \n", + "35 To develop an effective plan of care for managing a person's pain,... \n", + "36 To answer this question, we need to consider the available data on... \n", + "37 The analysis of oxygen isotope ratios in the shells of gastropods ... \n", + "38 To answer this question, we need to understand the definition of a... \n", + "39 To solve this problem, we need to consider the forces acting on a ... 
\n", + "\n", + " pred_answer metric \n", + "0 C ✔️ [True] \n", + "1 C ✔️ [True] \n", + "2 B ✔️ [True] \n", + "3 B ✔️ [True] \n", + "4 D ✔️ [True] \n", + "5 A ✔️ [True] \n", + "6 D ✔️ [True] \n", + "7 D ✔️ [True] \n", + "8 A ✔️ [True] \n", + "9 D ✔️ [True] \n", + "10 B ✔️ [True] \n", + "11 D ✔️ [True] \n", + "12 A ✔️ [True] \n", + "13 D ✔️ [True] \n", + "14 A ✔️ [True] \n", + "15 D ✔️ [True] \n", + "16 B ✔️ [True] \n", + "17 C ✔️ [True] \n", + "18 B ✔️ [True] \n", + "19 C ✔️ [True] \n", + "20 C \n", + "21 B \n", + "22 B ✔️ [True] \n", + "23 C ✔️ [True] \n", + "24 A ✔️ [True] \n", + "25 B ✔️ [True] \n", + "26 D ✔️ [True] \n", + "27 B \n", + "28 {C} \n", + "29 C ✔️ [True] \n", + "30 C \n", + "31 D ✔️ [True] \n", + "32 B ✔️ [True] \n", + "33 A ✔️ [True] \n", + "34 A ✔️ [True] \n", + "35 C ✔️ [True] \n", + "36 A ✔️ [True] \n", + "37 B ✔️ [True] \n", + "38 C ✔️ [True] \n", + "39 A ✔️ [True] " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 842 ms, sys: 161 ms, total: 1 s\n", + "Wall time: 22.1 s\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "print(\"Starting execution...\")\n", + "# eval_subset_size = len(testset)\n", + "score, results, all_scores = evaluate(\n", + " program,\n", + " devset=testset[:40],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import csv\n", + "\n", + "with open(\"my_results.csv\", mode=\"w\", newline=\"\", encoding=\"utf-8\") as f:\n", + " writer = csv.writer(f)\n", + " # If `results` is a list of lists, just write rows:\n", + " for row in results:\n", + " writer.writerow(row)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Results DataFrame:\n", + " Example_Index \\\n", + "0 0 \n", + "1 1 \n", + "2 2 \n", + "3 3 \n", + "4 4 \n", + "5 5 \n", + "6 6 \n", + "7 7 \n", + "8 8 \n", + "9 9 \n", + "10 10 \n", + "11 11 \n", + "12 12 \n", + "13 13 \n", + "14 14 \n", + "15 15 \n", + "16 16 \n", + "17 17 \n", + "18 18 \n", + "19 19 \n", + "20 20 \n", + "21 21 \n", + "22 22 \n", + "23 23 \n", + "24 24 \n", + "25 25 \n", + "26 26 \n", + "27 27 \n", + "28 28 \n", + "29 29 \n", + "30 30 \n", + "31 31 \n", + "32 32 \n", + "33 33 \n", + "34 34 \n", + "35 35 \n", + "36 36 \n", + "37 37 \n", + "38 38 \n", + "39 39 \n", + "40 40 \n", + "41 41 \n", + "42 42 \n", + "43 43 \n", + "44 44 \n", + "45 45 \n", + "46 46 \n", + "47 47 \n", + "48 48 \n", + "49 49 \n", + "50 50 \n", + "51 51 \n", + "52 52 \n", + "53 53 \n", + "54 54 \n", + "55 55 \n", + "56 56 \n", + "57 57 \n", + "58 58 \n", + "\n", + " Prediction \\\n", + "0 Prediction(\\n reasoning=\"To solve this problem, we need to appl... \n", + "1 Prediction(\\n reasoning='The capacitance of a parallel-plate ca... \n", + "2 Prediction(\\n reasoning=\"To calculate the contribution to $U_{\\... \n", + "3 Prediction(\\n reasoning='To translate the given statement \"Abdu... \n", + "4 Prediction(\\n reasoning='The passage states that perestroika, o... \n", + "5 Prediction(\\n reasoning='The refractory period is a phase that ... \n", + "6 Prediction(\\n reasoning=\"To solve this problem, we need to cons... \n", + "7 Prediction(\\n reasoning=\"To find the weight of one truck, we fi... \n", + "8 Prediction(\\n reasoning='To estimate the magnitude of the elect... 
\n", + "9 Prediction(\\n reasoning=\"The patient presents with nausea, vomi... \n", + "10 Prediction(\\n reasoning=\"The builder properly rejected the ship... \n", + "11 Prediction(\\n reasoning=\"The man conveyed a right-of-way easeme... \n", + "12 Prediction(\\n reasoning='To answer this question, we need to co... \n", + "13 Prediction(\\n reasoning='To answer this question, we need to un... \n", + "14 Prediction(\\n reasoning='To determine the final angular momentu... \n", + "15 Prediction(\\n reasoning='To calculate the molecular weight of t... \n", + "16 Prediction(\\n reasoning=\"The appropriateness and usefulness of ... \n", + "17 Prediction(\\n reasoning='To find the angular magnification of t... \n", + "18 Prediction(\\n reasoning='Multiple-choice questions are often ea... \n", + "19 Prediction(\\n reasoning='The question describes a treatment pro... \n", + "20 Prediction(\\n reasoning='To solve this problem, we need to use ... \n", + "21 Prediction(\\n reasoning=\"Since the DNA molecule is 20 percent a... \n", + "22 Prediction(\\n reasoning='The process described has NADP+, ADP, ... \n", + "23 Prediction(\\n reasoning='The question describes a scenario wher... \n", + "24 Prediction(\\n reasoning='The lac operon is a genetic regulatory... \n", + "25 Prediction(\\n reasoning='To answer this question, we need to id... \n", + "26 Prediction(\\n reasoning='To find the total tax Mr. Howard pays,... \n", + "27 Prediction(\\n reasoning='The amplitude of a damped harmonic osc... \n", + "28 Prediction(\\n reasoning='The statement \"Men are better drivers ... \n", + "29 Prediction(\\n reasoning='To find the desired level of money bal... \n", + "30 Prediction(\\n reasoning='The clinical presentation described fo... \n", + "31 Prediction(\\n reasoning=\"To solve this problem, we first need t... \n", + "32 Prediction(\\n reasoning='To determine where Mr. Balfour should ... \n", + "33 Prediction(\\n reasoning=\"To solve this problem, we need to unde... \n", + "34 Prediction(\\n reasoning='The patient presents with symptoms of ... \n", + "35 Prediction(\\n reasoning='The \"sex-change\" in bacteria is relate... \n", + "36 Prediction(\\n reasoning='To answer this question, we need to co... \n", + "37 Prediction(\\n reasoning=\"To determine if the burger joint owner... \n", + "38 Prediction(\\n reasoning=\"To find the pH of a 0.1 M solution of ... \n", + "39 Prediction(\\n reasoning='The question describes a scenario wher... \n", + "40 Prediction(\\n reasoning='The common law principles regarding th... \n", + "41 Prediction(\\n reasoning='To find the concentration of the sodiu... \n", + "42 Prediction(\\n reasoning='LC oscillators, also known as tank cir... \n", + "43 Prediction(\\n reasoning='To determine which of the given proces... \n", + "44 Prediction(\\n reasoning=\"The marginal utility approach explains... \n", + "45 Prediction(\\n reasoning=\"To solve this problem, we need to calc... \n", + "46 Prediction(\\n reasoning=\"The correct answer is based on the pri... \n", + "47 Prediction(\\n reasoning=\"To find the posterior probability \\\\(P... \n", + "48 Prediction(\\n reasoning=\"To determine the day of the week on wh... \n", + "49 Prediction(\\n reasoning=\"To solve this problem, we need to unde... \n", + "50 Prediction(\\n reasoning='To calculate the total change in entro... \n", + "51 Prediction(\\n reasoning='To determine when the tenant\\'s statut... \n", + "52 Prediction(\\n reasoning=\"To solve this problem, we first need t... 
\n", + "53 Prediction(\\n reasoning=\"To determine which type of organisatio... \n", + "54 Prediction(\\n reasoning='To answer this question, we need to un... \n", + "55 Prediction(\\n reasoning=\"To estimate a numerical measurement of... \n", + "56 Prediction(\\n reasoning='Anscombe\\'s work focuses on the philos... \n", + "57 Prediction(\\n reasoning=\"The practice of requiring students to ... \n", + "58 Prediction(\\n reasoning=\"The production period in the history o... \n", + "\n", + " Score \n", + "0 True \n", + "1 True \n", + "2 False \n", + "3 True \n", + "4 True \n", + "5 True \n", + "6 True \n", + "7 True \n", + "8 False \n", + "9 True \n", + "10 True \n", + "11 True \n", + "12 True \n", + "13 True \n", + "14 True \n", + "15 False \n", + "16 True \n", + "17 True \n", + "18 True \n", + "19 True \n", + "20 False \n", + "21 False \n", + "22 True \n", + "23 True \n", + "24 True \n", + "25 True \n", + "26 True \n", + "27 False \n", + "28 True \n", + "29 True \n", + "30 True \n", + "31 False \n", + "32 True \n", + "33 False \n", + "34 True \n", + "35 True \n", + "36 False \n", + "37 True \n", + "38 False \n", + "39 True \n", + "40 True \n", + "41 True \n", + "42 False \n", + "43 True \n", + "44 True \n", + "45 True \n", + "46 True \n", + "47 True \n", + "48 False \n", + "49 True \n", + "50 True \n", + "51 True \n", + "52 False \n", + "53 True \n", + "54 True \n", + "55 True \n", + "56 True \n", + "57 False \n", + "58 False \n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "df = pd.DataFrame([(i, str(pred), score) for i, (example, pred, score) in enumerate(results)],\n", + " columns=['Example_Index', 'Prediction', 'Score'])\n", + "print(\"\\nResults DataFrame:\")\n", + "print(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comparison_df = pd.DataFrame([{\n", + " 'Question': example.inputs()['question'][:100] + '...' 
if len(example.inputs()['question']) > 100 else example.inputs()['question'],\n", + " 'Predicted Answer': prediction.answer if hasattr(prediction, 'answer') else str(prediction),\n", + " 'Correct Answer': example.answer,\n", + " 'Is Correct': '✔️' if score else '❌'\n", + "} for example, prediction, score in results])\n", + "\n", + "\n", + "csv_filename = 'prediction_results.csv'\n", + "comparison_df.to_csv(csv_filename, index=False, encoding='utf-8-sig') # utf-8-sig to handle special characters\n", + "print(f\"\\nResults saved to {csv_filename}\")\n", + " \n", + "pd.set_option('display.max_colwidth', None)\n", + "print(\"\\nPredictions vs Actual Answers:\")\n", + "print(comparison_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "\n", + "print(\"Starting execution...\")\n", + "evaluate(\n", + " program,\n", + " devset=testset,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Optimize Subset + Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logging.getLogger('dspy').setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "subset_size = 200\n", + "optimizer = dspy.MIPROv2(\n", + " metric=benchmark.metric,\n", + " auto=\"light\",\n", + " num_threads=NUM_THREADS,\n", + " task_model=TASK_MODEL,\n", + " prompt_model=PROMPT_MODEL,\n", + " max_bootstrapped_demos=FEW_SHOTS * 2,\n", + " max_labeled_demos=FEW_SHOTS,\n", + ")\n", + "\n", + "optimized_program = optimizer.compile(\n", + " program,\n", + " trainset=trainset[:subset_size],\n", + " valset=valset[:subset_size],\n", + " requires_permission_to_run=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST PROMPT:\n", + " You are a helpful assistant.\n", + "CPU times: user 286 μs, sys: 24 μs, total: 310 μs\n", + "Wall time: 265 μs\n" + ] + } + ], + "source": [ + "%%time\n", + "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Module signature instructions:\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "'str' object has no attribute 'instructions'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[15], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(module, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msignature\u001b[39m\u001b[38;5;124m'\u001b[39m):\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mModule \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmodule_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m instructions:\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m----> 4\u001b[0m 
\u001b[38;5;28mprint\u001b[39m(\u001b[43mmodule\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msignature\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minstructions\u001b[49m)\n", + "\u001b[0;31mAttributeError\u001b[0m: 'str' object has no attribute 'instructions'" + ] + } + ], + "source": [ + "for module_name, module in optimized_program.__dict__.items():\n", + " if hasattr(module, 'signature'):\n", + " print(f\"\\nModule {module_name} instructions:\")\n", + " print(module.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST EXAMPLES:\n", + " []\n" + ] + } + ], + "source": [ + "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST EXAMPLES:\n", + " [\n", + " {\n", + " \"question\": \"Dr. Ryan is a psychotherapist in a small town. She receives a phone call from a man she was sexually involved with several years ago. The relationship lasted only three months and ended amicably. The man is now married and is having problems unrelated to their past relationship. He has called to see if he can begin seeing Dr. Ryan in therapy. Dr. Ryan should:\",\n", + " \"options\": {\n", + " \"A\": \"see the man but disclose their past relationship to a supervisor or colleague for transparency.\",\n", + " \"B\": \"deny the man's request without providing a reason.\",\n", + " \"C\": \"see the man only if their relationship ended more than two years ago and she determines that it will not interfere with her objectivity.\",\n", + " \"D\": \"see the man only if she feels she can maintain professional boundaries despite their past.\",\n", + " \"E\": \"refer the man to a colleague.\",\n", + " \"F\": \"continue therapy with the man but avoid discussing their past relationship.\",\n", + " \"G\": \"see the man only if she determines that their past relationship will not interfere with her objectivity.\",\n", + " \"H\": \"provide therapy for the man over the phone or through online sessions to maintain physical boundaries.\",\n", + " \"I\": \"see the man only if his wife is aware of their past relationship and consents to the therapy.\",\n", + " \"J\": \"see the man but discuss the potential for con\\ufb02icts with him before beginning treatment.\"\n", + " },\n", + " \"answer\": \"E\",\n", + " \"reasoning\": \"The American Psychological Association's Ethics Code states that psychologists should refrain from entering into a therapeutic relationship with individuals with whom they have a personal relationship that could impair their professional judgment or create a risk of exploitation. In this case, Dr. Ryan's past sexual relationship with the man could potentially create a dual relationship, which may impair her objectivity and professional judgment. While the relationship ended amicably and was several years ago, it is still important for Dr. Ryan to consider whether her past involvement with the man could influence her ability to provide unbiased and effective therapy. The most appropriate course of action would be for Dr. Ryan to refer the man to a colleague, as this would avoid any potential conflict of interest or dual relationship. 
This approach prioritizes the man's need for therapy while also maintaining the ethical standards of the profession.\",\n", + " \"input_keys\": null\n", + " },\n", + " {\n", + " \"question\": \"A writer sent a four-page synopsis of an idea for a new television series to a Hollywood producer. He sent it in response to an ad for new ideas in an industry publication. He discussed it with the producer's assistant in a phone call, and mentioned his expectation of compensation. She said, \\\"Well, of course, we always pay for a writer's work.\\\" She said she would go over it with her boss to see if he liked it. Several months later, the writer saw a casting call for a new series. The plot and characters were nearly identical to those described in his synopsis. He sued the producer for breach of contract. The producer defended by arguing that there was no contract. What is the likely ruling of the court?\",\n", + " \"options\": {\n", + " \"A\": \"The court will rule that there can be no contract as the writer didn't explicitly state his expectation for compensation for the idea itself.\",\n", + " \"B\": \"The court will rule that there was an implied in fact contract between the parties based on the conduct that they manifested.\",\n", + " \"C\": \"The court will rule that there was no consideration specifically mentioned and under those circumstances, the law viewed the synopsis as a gift to the producer.\",\n", + " \"D\": \"The court will rule that there is an express contract because there was a meeting of the mind and mutual assent to the basic terms.\",\n", + " \"E\": \"The court will rule that there was a unilateral contract, with the producer obligated to pay only if he used the idea.\",\n", + " \"F\": \"The court will rule that the writer's case is invalid as there was no written agreement.\",\n", + " \"G\": \"The court will rule that the producer's assistant lacked the authority to form a contract, and thus, no contract exists.\",\n", + " \"H\": \"There were too many terms left out for this to be a contract, and it was too indefinite for the court to imply what the terms might have been.\",\n", + " \"I\": \"The court will rule that the phone call between the writer and the producer's assistant formed a verbal contract.\",\n", + " \"J\": null\n", + " },\n", + " \"answer\": \"B\",\n", + " \"reasoning\": \"\",\n", + " \"input_keys\": null\n", + " },\n", + " {\n", + " \"question\": \"A test charge q C, moving with a velocityv= (i_x +i_y) m/sec, experiences no force in a region of electric and magnetic fields. 
If the magnetic flux density B= (i_x - 2i_z)Wb/m^2, findE.\",\n", + " \"options\": {\n", + " \"A\": \"(i_z - 2i_y) volts/m\",\n", + " \"B\": \"(-i_x + i_y) volts/m\",\n", + " \"C\": \"(3i_x - i_y - 2i_z) volts/m\",\n", + " \"D\": \"(2ix - 2i_y +i_z) volts/m\",\n", + " \"E\": \"(i_x - 2i_z) volts/m\",\n", + " \"F\": \"(2i_x - i_y) volts/m\",\n", + " \"G\": \"(i_x +i_y) volts/m\",\n", + " \"H\": \"(2ix + 2i_y -i_z) volts/m\",\n", + " \"I\": \"(i_y + i_z) volts/m\",\n", + " \"J\": \"(i_x + 2i_y + i_z) volts/m\"\n", + " },\n", + " \"answer\": \"D\",\n", + " \"reasoning\": \"\",\n", + " \"input_keys\": null\n", + " }\n", + "]\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "def example_to_dict(example):\n", + " return {\n", + " 'question': example.question,\n", + " 'options': example.options,\n", + " 'answer': example.answer,\n", + " 'reasoning': example.reasoning,\n", + " 'input_keys': list(example.input_keys) if hasattr(example, 'input_keys') else None\n", + " }\n", + "\n", + "examples_json = [example_to_dict(example) for example in optimized_program.demos]\n", + "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 37.00 / 40 (92.5%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 40/40 [00:00<00:00, 2657.86it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/30 13:40:18 INFO dspy.evaluate.evaluate: Average Metric: 37 / 40 (92.5%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
questionoptionsexample_answerreasoningpred_answermetric
0For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...ATo determine which scenario involves the main character doing some...A✔️ [True]
1Which of the following is not a TV cartoon duo?{'A': 'Chip and Dale', 'B': 'Beavis and Butt-head', 'C': 'Simon an...CTo determine which of the following is not a TV cartoon duo, we ne...C✔️ [True]
2Market testing studies use __________ to carry out controlled expe...{'A': 'Test markets.', 'B': 'Data mining.', 'C': 'Analytical infor...ATo answer this question, we need to understand the concept of mark...A✔️ [True]
3Which of the following is the essential feature of Conduct Disorder{'A': 'Lack of appropriate guilt or remorse', 'B': 'Major rule or ...BTo answer this question, we need to consider the characteristics o...B✔️ [True]
4For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...CTo determine which scenario involves the main character doing some...C✔️ [True]
5An off-duty police officer was standing on a street corner waiting...{'A': 'assault with a deadly weapon.', 'B': 'involuntary manslaugh...BThe police officer's actions, although intended to frighten the ma...B✔️ [True]
6There are 2,000 insurance agents licensed under the general licens...{'A': 'constitutional, because a state license is a privilege and ...CThe key issue here is whether the state statute revoking the insur...C✔️ [True]
7Millions of immigrant children who entered the United States learn...{'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st...AThe concept described in the question involves immigrant children ...A✔️ [True]
8This question refers to the following information. \"When we were k...{'A': 'The labor union movement.', 'B': 'The civil rights movement...DThe Port Huron Statement, as excerpted, discusses themes of social...D✔️ [True]
9Who is the eighth-century CE female poet worshipped throughout ma...{'A': 'Andal', 'B': 'Devi', 'C': 'Ganga', 'D': 'Kali'}AThe question asks for an eighth-century CE female poet who is wors...A✔️ [True]
10Clifford and Lucia Pauling, in Senior View, told us that physical ...{'A': 'Are rapid and frightening', 'B': 'Can be offset by meditati...CTo answer this question, we need to consider the context of physic...C✔️ [True]
11Which statement best describes one of Dworkin's central arguments ...{'A': 'Morality plays no role in the concept of law.', 'B': 'Moral...DTo answer this question, we need to consider the central arguments...D✔️ [True]
12Light that is not transmitted by opaque materials is{'A': 'reflected or converted to internal energy in the material.'...AWhen light hits an opaque material, it does not pass through becau...A✔️ [True]
13Which of the following was not defined by Giddens (1998) as part o...{'A': 'the democratization of the family', 'B': 'putting an end to...BTo answer this question, we need to consider the key components of...B✔️ [True]
14The U.S. economy currently suffers a recessionary gap and a budget...{'A': 'Tax increase \\xa0\\xa0\\xa0 Demand rises \\xa0\\xa0\\xa0 Falling...CTo address a recessionary gap and a budget deficit through fiscal ...C✔️ [True]
15A company president is concerned about the low motivation and sati...{'A': 'ERG theory', 'B': 'expectancy theory', 'C': 'equity theory'...DThe scenario describes a situation where a company president imple...D✔️ [True]
16What characteristic is not representative of a type IIb muscle fib...{'A': 'Low oxidative capacity', 'B': 'High fatigue resistance', 'C...BTo answer this question, we need to understand the characteristics...B✔️ [True]
17The energy for all forms of muscle contraction is provided by:{'A': 'ATP.', 'B': 'ADP.', 'C': 'phosphocreatine.', 'D': 'oxidativ...ATo answer this question, we need to understand the role of differe...A✔️ [True]
18The main factor preventing subsistence economies from advancing ec...{'A': 'a currency.', 'B': 'a well-connected transportation infrast...DTo address this question, let's consider what subsistence economie...B
19The primary research method used by developmental psychologists is{'A': 'case study', 'B': 'cross-sectional research', 'C': 'natural...BDevelopmental psychologists often aim to understand how individual...B✔️ [True]
20Kevin wants shoes and grows turnips. Lisa wants turnips and makes ...{'A': 'Store of value', 'B': 'Unit of account', 'C': 'Medium of ex...CIn this scenario, Kevin, Lisa, and Bob have different needs and pr...C✔️ [True]
21The Federal Reserve implements an expansionary policy by doing whi...{'A': 'Buying Treasury bonds in the open market', 'B': 'Raising th...ATo answer this question, we need to understand the tools the Feder...A✔️ [True]
22Why do political scientists identify the presidential elections of...{'A': 'The issues at stake in those elections were more important ...BTo answer this question, we need to understand what is meant by \"c...B✔️ [True]
23An entity engaged an accountant to review its financial statements...{'A': 'Withdrawn from the engagement because the entity has not be...CWhen an accountant is engaged to review financial statements in ac...C✔️ [True]
24Which of these statements defines the Copenhagen School's view of ...{'A': 'Security is a socially constructed concept, relative to the...AThe Copenhagen School's view of security is rooted in the concept ...A✔️ [True]
25An object of mass m1 experiences a linear, elastic collision with ...{'A': 'The final speed of object 1', 'B': 'The initial speed of ob...DTo determine the mass of the second object in a linear, elastic co...D✔️ [True]
26This question refers to the following information. \"Thereupon it w...{'A': 'Portuguese explorers were convinced that the route to the N...DThe Treaty of Tordesillas, signed in 1494, was an agreement betwee...D✔️ [True]
27When both short-run aggregate supply and aggregate demand increase...{'A': 'The price level rises but real GDP falls.', 'B': 'Both the ...DWhen both short-run aggregate supply and aggregate demand increase...D✔️ [True]
28This question refers to the following information. \"The challenge ...{'A': 'the Civil Rights Act of 1964', 'B': 'Medicare and Medicaid'...DTo answer this question, we need to analyze the given text and opt...D✔️ [True]
29Ada Lovelace is credited with being the first person to have made ...{'A': 'a computer program', 'B': 'a souffle', 'C': 'a brassiere', ...ATo answer this question, we need to consider the historical contri...A✔️ [True]
30How are new polyomaviruses detailed{'A': 'Shot gun sequencing', 'B': 'Cultivation in human neural cel...CTo detail new polyomaviruses, we must consider the methods used fo...C✔️ [True]
31What is the square root of 81 squared?{'A': '9', 'B': '27', 'C': '81', 'D': '729'}CTo find the square root of 81 squared, we first calculate 81 squar...C✔️ [True]
32For which of these two scenarios does the main character (who uses...{'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr...CTo determine which scenario involves the main character doing some...C✔️ [True]
33This question refers to the following information. \"The conscience...{'A': 'Passage of the Pure Food and Water Act', 'B': 'Creation of ...DThe Progressive Party Platform of 1912 emphasizes the importance o...D✔️ [True]
34A 50-year-old nurse who had been fired from his job at ahospital t...{'A': 'No, because sanctions are not proper against the attorney o...BThe court is likely to grant the hospital's motion for sanctions b...C
35According to John Money, __________ refers to a person's private i...{'A': 'Gender identity', 'B': 'Sexual nomenclature', 'C': 'Gender ...ATo answer this question, we need to understand the concept of gend...A✔️ [True]
36Rounded to the nearest 10,000, the population of Louisiana was 4,5...{'A': '4,500,321', 'B': '4,524,491', 'C': '4,533,372', 'D': '4,535...CTo find the actual population of Louisiana in 2010, we need to con...C✔️ [True]
37Compared with the sound you hear from the siren of a stationary fi...{'A': 'speed', 'B': 'frequency', 'C': 'wavelength', 'D': 'All of t...BThe question is about the Doppler effect, which describes how the ...B✔️ [True]
38______ regulation goes further than the ______ that CSOs have with...{'A': 'Legal, Interactions, Mechanics', 'B': 'Civil, Relations, Ou...BTo solve this, let's break down the components of the sentence and...C
39Aristotle saw his theory of the virtues as:{'A': 'final and complete.', 'B': 'open-ended and revisable.', 'C'...BAristotle's theory of the virtues, as outlined in his Nicomachean ...B✔️ [True]
\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 For which of these two scenarios does the main character (who uses... \n", + "1 Which of the following is not a TV cartoon duo? \n", + "2 Market testing studies use __________ to carry out controlled expe... \n", + "3 Which of the following is the essential feature of Conduct Disorder \n", + "4 For which of these two scenarios does the main character (who uses... \n", + "5 An off-duty police officer was standing on a street corner waiting... \n", + "6 There are 2,000 insurance agents licensed under the general licens... \n", + "7 Millions of immigrant children who entered the United States learn... \n", + "8 This question refers to the following information. \"When we were k... \n", + "9 Who is the eighth-century CE female poet worshipped throughout ma... \n", + "10 Clifford and Lucia Pauling, in Senior View, told us that physical ... \n", + "11 Which statement best describes one of Dworkin's central arguments ... \n", + "12 Light that is not transmitted by opaque materials is \n", + "13 Which of the following was not defined by Giddens (1998) as part o... \n", + "14 The U.S. economy currently suffers a recessionary gap and a budget... \n", + "15 A company president is concerned about the low motivation and sati... \n", + "16 What characteristic is not representative of a type IIb muscle fib... \n", + "17 The energy for all forms of muscle contraction is provided by: \n", + "18 The main factor preventing subsistence economies from advancing ec... \n", + "19 The primary research method used by developmental psychologists is \n", + "20 Kevin wants shoes and grows turnips. Lisa wants turnips and makes ... \n", + "21 The Federal Reserve implements an expansionary policy by doing whi... \n", + "22 Why do political scientists identify the presidential elections of... \n", + "23 An entity engaged an accountant to review its financial statements... \n", + "24 Which of these statements defines the Copenhagen School's view of ... \n", + "25 An object of mass m1 experiences a linear, elastic collision with ... \n", + "26 This question refers to the following information. \"Thereupon it w... \n", + "27 When both short-run aggregate supply and aggregate demand increase... \n", + "28 This question refers to the following information. \"The challenge ... \n", + "29 Ada Lovelace is credited with being the first person to have made ... \n", + "30 How are new polyomaviruses detailed \n", + "31 What is the square root of 81 squared? \n", + "32 For which of these two scenarios does the main character (who uses... \n", + "33 This question refers to the following information. \"The conscience... \n", + "34 A 50-year-old nurse who had been fired from his job at ahospital t... \n", + "35 According to John Money, __________ refers to a person's private i... \n", + "36 Rounded to the nearest 10,000, the population of Louisiana was 4,5... \n", + "37 Compared with the sound you hear from the siren of a stationary fi... \n", + "38 ______ regulation goes further than the ______ that CSOs have with... \n", + "39 Aristotle saw his theory of the virtues as: \n", + "\n", + " options \\\n", + "0 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", + "1 {'A': 'Chip and Dale', 'B': 'Beavis and Butt-head', 'C': 'Simon an... \n", + "2 {'A': 'Test markets.', 'B': 'Data mining.', 'C': 'Analytical infor... \n", + "3 {'A': 'Lack of appropriate guilt or remorse', 'B': 'Major rule or ... 
\n", + "4 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", + "5 {'A': 'assault with a deadly weapon.', 'B': 'involuntary manslaugh... \n", + "6 {'A': 'constitutional, because a state license is a privilege and ... \n", + "7 {'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st... \n", + "8 {'A': 'The labor union movement.', 'B': 'The civil rights movement... \n", + "9 {'A': 'Andal', 'B': 'Devi', 'C': 'Ganga', 'D': 'Kali'} \n", + "10 {'A': 'Are rapid and frightening', 'B': 'Can be offset by meditati... \n", + "11 {'A': 'Morality plays no role in the concept of law.', 'B': 'Moral... \n", + "12 {'A': 'reflected or converted to internal energy in the material.'... \n", + "13 {'A': 'the democratization of the family', 'B': 'putting an end to... \n", + "14 {'A': 'Tax increase \\xa0\\xa0\\xa0 Demand rises \\xa0\\xa0\\xa0 Falling... \n", + "15 {'A': 'ERG theory', 'B': 'expectancy theory', 'C': 'equity theory'... \n", + "16 {'A': 'Low oxidative capacity', 'B': 'High fatigue resistance', 'C... \n", + "17 {'A': 'ATP.', 'B': 'ADP.', 'C': 'phosphocreatine.', 'D': 'oxidativ... \n", + "18 {'A': 'a currency.', 'B': 'a well-connected transportation infrast... \n", + "19 {'A': 'case study', 'B': 'cross-sectional research', 'C': 'natural... \n", + "20 {'A': 'Store of value', 'B': 'Unit of account', 'C': 'Medium of ex... \n", + "21 {'A': 'Buying Treasury bonds in the open market', 'B': 'Raising th... \n", + "22 {'A': 'The issues at stake in those elections were more important ... \n", + "23 {'A': 'Withdrawn from the engagement because the entity has not be... \n", + "24 {'A': 'Security is a socially constructed concept, relative to the... \n", + "25 {'A': 'The final speed of object 1', 'B': 'The initial speed of ob... \n", + "26 {'A': 'Portuguese explorers were convinced that the route to the N... \n", + "27 {'A': 'The price level rises but real GDP falls.', 'B': 'Both the ... \n", + "28 {'A': 'the Civil Rights Act of 1964', 'B': 'Medicare and Medicaid'... \n", + "29 {'A': 'a computer program', 'B': 'a souffle', 'C': 'a brassiere', ... \n", + "30 {'A': 'Shot gun sequencing', 'B': 'Cultivation in human neural cel... \n", + "31 {'A': '9', 'B': '27', 'C': '81', 'D': '729'} \n", + "32 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", + "33 {'A': 'Passage of the Pure Food and Water Act', 'B': 'Creation of ... \n", + "34 {'A': 'No, because sanctions are not proper against the attorney o... \n", + "35 {'A': 'Gender identity', 'B': 'Sexual nomenclature', 'C': 'Gender ... \n", + "36 {'A': '4,500,321', 'B': '4,524,491', 'C': '4,533,372', 'D': '4,535... \n", + "37 {'A': 'speed', 'B': 'frequency', 'C': 'wavelength', 'D': 'All of t... \n", + "38 {'A': 'Legal, Interactions, Mechanics', 'B': 'Civil, Relations, Ou... \n", + "39 {'A': 'final and complete.', 'B': 'open-ended and revisable.', 'C'... \n", + "\n", + " example_answer \\\n", + "0 A \n", + "1 C \n", + "2 A \n", + "3 B \n", + "4 C \n", + "5 B \n", + "6 C \n", + "7 A \n", + "8 D \n", + "9 A \n", + "10 C \n", + "11 D \n", + "12 A \n", + "13 B \n", + "14 C \n", + "15 D \n", + "16 B \n", + "17 A \n", + "18 D \n", + "19 B \n", + "20 C \n", + "21 A \n", + "22 B \n", + "23 C \n", + "24 A \n", + "25 D \n", + "26 D \n", + "27 D \n", + "28 D \n", + "29 A \n", + "30 C \n", + "31 C \n", + "32 C \n", + "33 D \n", + "34 B \n", + "35 A \n", + "36 C \n", + "37 B \n", + "38 B \n", + "39 B \n", + "\n", + " reasoning \\\n", + "0 To determine which scenario involves the main character doing some... 
\n", + "1 To determine which of the following is not a TV cartoon duo, we ne... \n", + "2 To answer this question, we need to understand the concept of mark... \n", + "3 To answer this question, we need to consider the characteristics o... \n", + "4 To determine which scenario involves the main character doing some... \n", + "5 The police officer's actions, although intended to frighten the ma... \n", + "6 The key issue here is whether the state statute revoking the insur... \n", + "7 The concept described in the question involves immigrant children ... \n", + "8 The Port Huron Statement, as excerpted, discusses themes of social... \n", + "9 The question asks for an eighth-century CE female poet who is wors... \n", + "10 To answer this question, we need to consider the context of physic... \n", + "11 To answer this question, we need to consider the central arguments... \n", + "12 When light hits an opaque material, it does not pass through becau... \n", + "13 To answer this question, we need to consider the key components of... \n", + "14 To address a recessionary gap and a budget deficit through fiscal ... \n", + "15 The scenario describes a situation where a company president imple... \n", + "16 To answer this question, we need to understand the characteristics... \n", + "17 To answer this question, we need to understand the role of differe... \n", + "18 To address this question, let's consider what subsistence economie... \n", + "19 Developmental psychologists often aim to understand how individual... \n", + "20 In this scenario, Kevin, Lisa, and Bob have different needs and pr... \n", + "21 To answer this question, we need to understand the tools the Feder... \n", + "22 To answer this question, we need to understand what is meant by \"c... \n", + "23 When an accountant is engaged to review financial statements in ac... \n", + "24 The Copenhagen School's view of security is rooted in the concept ... \n", + "25 To determine the mass of the second object in a linear, elastic co... \n", + "26 The Treaty of Tordesillas, signed in 1494, was an agreement betwee... \n", + "27 When both short-run aggregate supply and aggregate demand increase... \n", + "28 To answer this question, we need to analyze the given text and opt... \n", + "29 To answer this question, we need to consider the historical contri... \n", + "30 To detail new polyomaviruses, we must consider the methods used fo... \n", + "31 To find the square root of 81 squared, we first calculate 81 squar... \n", + "32 To determine which scenario involves the main character doing some... \n", + "33 The Progressive Party Platform of 1912 emphasizes the importance o... \n", + "34 The court is likely to grant the hospital's motion for sanctions b... \n", + "35 To answer this question, we need to understand the concept of gend... \n", + "36 To find the actual population of Louisiana in 2010, we need to con... \n", + "37 The question is about the Doppler effect, which describes how the ... \n", + "38 To solve this, let's break down the components of the sentence and... \n", + "39 Aristotle's theory of the virtues, as outlined in his Nicomachean ... 
\n", + "\n", + " pred_answer metric \n", + "0 A ✔️ [True] \n", + "1 C ✔️ [True] \n", + "2 A ✔️ [True] \n", + "3 B ✔️ [True] \n", + "4 C ✔️ [True] \n", + "5 B ✔️ [True] \n", + "6 C ✔️ [True] \n", + "7 A ✔️ [True] \n", + "8 D ✔️ [True] \n", + "9 A ✔️ [True] \n", + "10 C ✔️ [True] \n", + "11 D ✔️ [True] \n", + "12 A ✔️ [True] \n", + "13 B ✔️ [True] \n", + "14 C ✔️ [True] \n", + "15 D ✔️ [True] \n", + "16 B ✔️ [True] \n", + "17 A ✔️ [True] \n", + "18 B \n", + "19 B ✔️ [True] \n", + "20 C ✔️ [True] \n", + "21 A ✔️ [True] \n", + "22 B ✔️ [True] \n", + "23 C ✔️ [True] \n", + "24 A ✔️ [True] \n", + "25 D ✔️ [True] \n", + "26 D ✔️ [True] \n", + "27 D ✔️ [True] \n", + "28 D ✔️ [True] \n", + "29 A ✔️ [True] \n", + "30 C ✔️ [True] \n", + "31 C ✔️ [True] \n", + "32 C ✔️ [True] \n", + "33 D ✔️ [True] \n", + "34 C \n", + "35 A ✔️ [True] \n", + "36 C ✔️ [True] \n", + "37 B ✔️ [True] \n", + "38 C \n", + "39 B ✔️ [True] " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 64.1 ms, sys: 5.03 ms, total: 69.1 ms\n", + "Wall time: 63.5 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "score, results, all_scores = evaluate(\n", + " optimized_program,\n", + " devset=testset[:40],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Medium Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:\n", + "num_trials: 25\n", + "minibatch: True\n", + "num_candidates: 25\n", + "valset size: 300\n", + "\n", + "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==\n", + "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used for informing instruction proposal.\n", + "\n", + "2025/01/29 23:05:39 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=25 sets of demonstrations...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapping set 1/25\n", + "Bootstrapping set 2/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:08<35:10, 4.24s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 3/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:00<00:00, 1029.78it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 4/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:09<40:22, 4.86s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 5/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:39<1:49:09, 13.18s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 6/25\n" + ] + }, + 
{ + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:07<1:03:02, 7.58s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 7/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:05<46:14, 5.56s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 8/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:03<28:32, 3.43s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 9/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:20<56:43, 6.85s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 10/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:25<1:10:33, 8.52s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 11/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:09<38:41, 4.66s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 12/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:11<47:16, 5.70s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 13/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:08<1:14:39, 8.98s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 14/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:09<39:02, 4.70s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 15/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:11<46:20, 5.58s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 16/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:23<1:04:49, 7.82s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 17/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:06<56:44, 
6.82s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 18/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:14<40:10, 4.85s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 19/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:19<54:38, 6.60s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 20/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:10<42:03, 5.07s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 21/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▉ | 4/500 [00:29<1:01:42, 7.46s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.\n", + "Bootstrapping set 22/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 1%|█▍ | 3/500 [00:22<1:02:47, 7.58s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.\n", + "Bootstrapping set 23/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▉ | 2/500 [00:14<58:31, 7.05s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 2 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.\n", + "Bootstrapping set 24/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:04<39:09, 4.71s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n", + "Bootstrapping set 25/25\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 0%|▍ | 1/500 [00:10<1:31:02, 10.95s/it]\n", + "2025/01/29 23:11:07 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==\n", + "2025/01/29 23:11:07 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:11:19 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "Proposing instructions...\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 0: You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. 
Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 1: To tackle a multiple-choice question effectively, carefully read and analyze the question stem, identifying key concepts and any specific details that might narrow down the possible answers. Next, assess each answer choice in relation to the question, considering the relevance, accuracy, and implications of each option. Provide a step-by-step breakdown of your reasoning process, ensuring that each step logically follows from the previous one and that you address any complexities or nuances of the question. In your explanation, define key terms, explain relevant concepts, and justify why certain options can be eliminated or why one option stands out as the correct answer. Throughout your response, maintain clarity and organization, using transitional phrases and clear headings to guide the reader through your thought process. Finally, conclude by stating the correct answer and summarizing the main points that led to this conclusion, reinforcing the reasoning and ensuring that the explanation is accessible to readers of various expertise levels.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 2: You are a skilled educator and analyst with expertise in multiple disciplines, including law, economics, physics, and general knowledge. Your role is to guide users through complex multiple-choice questions by providing detailed, step-by-step reasoning and explanations. You aim to enhance users' critical thinking skills, decision-making processes, and confidence in tackling assessments across various subjects. When approaching a question, consider the context, analyze each option carefully, and break down the reasoning into clear, structured steps. Adapt your explanations to cater to users with different levels of expertise, ensuring that your responses are informative, engaging, and easy to understand. By doing so, you will not only help users arrive at the correct answers but also foster a deeper understanding of the subject matter and improve their ability to approach similar questions independently.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Act as an expert tutor guiding students through complex multiple-choice questions. Analyze each question carefully, considering all possible answer choices and their implications. Generate a clear, step-by-step explanation for why a particular answer is correct, focusing on breaking down difficult concepts into manageable parts. Ensure your reasoning is concise, yet detailed enough to illuminate the thought process behind selecting the correct answer. Remember, the goal is not just to provide the right answer, but to educate and enhance the student's understanding and critical thinking skills. Approach each question with the mindset of teaching a student how to think through the problem, rather than just giving them the solution. 
By doing so, you will help foster a deeper understanding of the subject matter and improve the student's ability to tackle similar questions independently.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 4: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to arrive at the correct answer. Ensure the explanation is clear, structured, and adapted to the user's level of expertise, fostering deeper comprehension and confidence in tackling multiple-choice assessments.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 5: To tackle multiple-choice questions effectively, analyze the question stem to identify key concepts and issues. Then, evaluate each answer choice by considering its relevance, logical consistency, and alignment with the question's requirements. Break down complex concepts into manageable parts, and assess how each option addresses the question's core concerns. Provide a step-by-step reasoning process that justifies the selection of the correct answer, ensuring that the explanation is clear, concise, and tailored to the user's level of expertise. Ultimately, aim to enhance the user's critical thinking skills and confidence in approaching multiple-choice assessments by offering insightful guidance and fostering a deeper understanding of the subject matter.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 6: To provide a well-structured response, analyze the given multiple-choice question and options. Break down the question into key components, and then assess each option against these components. Offer a step-by-step reasoning process that leads to the selection of the correct answer. Ensure that the reasoning is clear, concise, and adapted to the user's level of expertise. The goal is to not only select the correct answer but to also explain why it is correct and why the other options are incorrect, thereby enhancing the user's understanding and critical thinking skills.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 7: Act as an expert tutor guiding students through complex multiple-choice questions. When presented with a question and a set of options, carefully analyze the query, evaluate each answer choice, and construct a detailed, step-by-step explanation that leads to the correct answer. Ensure your reasoning is clear, concise, and tailored to the user's level of understanding, promoting a deeper grasp of the subject matter and enhancing their ability to approach similar questions with confidence. Your explanation should not only justify the correct answer but also provide insight into why the other options are incorrect, thereby fostering critical thinking and analytical skills.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 8: You are a meticulous and analytical expert in multiple disciplines, including law, entertainment, and science, tasked with providing detailed explanations for multiple-choice questions. Your role involves carefully reading each question, evaluating the provided options, and selecting the most appropriate answer based on your knowledge and reasoning skills. For each question, you will provide a step-by-step breakdown of your thought process, explaining why you chose a particular answer and why the other options are incorrect. 
Your goal is to not only provide the correct answer but also to educate and guide users through the reasoning process, helping them develop their critical thinking skills and improve their ability to tackle complex multiple-choice assessments.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 9: To effectively utilize the language model for solving multiple-choice questions, provide the model with a clear and structured input that includes the question, the available options, and any relevant context or information. The model should then analyze the question, assess each option, and generate a step-by-step reasoning process to arrive at the correct answer. This process should be transparent, with the model explaining its thought process and the rationale behind its choice. The output should include the correct answer, along with a detailed explanation of how the model deduced this answer from the given options. The goal is to not only provide the correct answer but also to educate the user on the reasoning and critical thinking skills required to approach such questions. The model should adapt its explanations based on the user's level of expertise, ensuring that the information is presented in a clear and understandable manner. By doing so, the model will help users develop their critical thinking and analytical skills, enhancing their ability to tackle complex multiple-choice questions across various subjects.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 10: You are a skilled analyst and educator with expertise in critical thinking, analytical reasoning, and multiple-choice question strategy. Your role involves guiding users through complex questions, evaluating answer options, and providing detailed, step-by-step explanations to facilitate deeper understanding and improved decision-making skills. By adapting your approach to the user's level of expertise, you aim to enhance their ability to analyze information, identify relevant details, and select the most appropriate answer. When presented with a question, carefully consider the context, assess each option, and generate a well-reasoned explanation for the correct answer, ensuring that your response is clear, structured, and informative.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 11: To utilize the Predict module effectively, provide a clear and concise multiple-choice question along with the available options. Ensure the question is well-structured and relevant to the topic at hand, whether it pertains to legal, psychological, or any other domain. The Predict module is designed to analyze the question, evaluate the provided options, and generate a detailed, step-by-step reasoning process to arrive at the most appropriate answer. This process involves breaking down complex concepts into manageable parts, assessing the relevance and probative value of information, and applying principles from the respective field of study to support the conclusion. The goal is to not only select the correct answer but to foster a deeper understanding of the subject matter, enhance critical thinking skills, and promote confidence in tackling similar questions in the future. When formulating your question, consider the context, the principles or rules that apply, and how the options provided relate to the question's requirements. 
The Predict module will then use this information to produce a comprehensive explanation, making it an invaluable tool for educational purposes, professional development, and the refinement of analytical reasoning skills.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 12: You are a critical component of a high-stakes decision-making system, and your task is to analyze complex multiple-choice questions, assess the provided answer choices, and generate step-by-step reasoning to arrive at the correct answer. The questions span various disciplines, including economics, law, physics, and general knowledge, requiring you to be versatile and knowledgeable. Your explanations must be clear, structured, and adapted to the user's level of expertise, as the consequences of incorrect decisions could be significant. For instance, in a real-world scenario, a incorrect answer could lead to financial losses, legal repercussions, or even physical harm. Your goal is to provide accurate and well-reasoned answers, fostering deeper comprehension and confidence in users as they tackle challenging assessments. You must evaluate each question carefully, considering the context, the subject matter, and the implications of each possible answer choice. By doing so, you will help users develop critical thinking skills, improve their decision-making process, and ultimately make informed decisions in high-pressure situations.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 13: You are a skilled educator and subject matter expert, proficient in a wide range of disciplines, including economics, law, physics, and general knowledge. Your role is to analyze complex multiple-choice questions, evaluate the provided options, and generate detailed, step-by-step explanations to justify the correct answer. By doing so, you aim to enhance the user's understanding, foster critical thinking, and improve their ability to approach similar questions with confidence. When responding, please provide a clear and structured reasoning process, adapting your explanation to the user's level of expertise, and ensure that your answer includes the correct choice (A, B, C, D, etc.) along with a thorough justification for why it is the correct option.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 14: You are a skilled educator with expertise in a wide range of subjects, including economics, law, physics, and general knowledge. Your role is to guide students through multiple-choice questions by providing detailed, step-by-step explanations that enhance their understanding and critical thinking skills. When presented with a question and a set of answer options, analyze the question carefully, assess each option, and then offer a well-reasoned explanation for your answer choice. Ensure that your explanations are clear, structured, and adapted to the student's level of expertise. Your goal is to not only help students arrive at the correct answer but also to foster deeper comprehension and confidence in tackling multiple-choice assessments across various disciplines.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 15: To provide accurate and helpful responses to multiple-choice questions, analyze the question carefully, considering the subject matter and the specific concepts being tested. Evaluate each answer choice based on its relevance, accuracy, and alignment with the question's requirements. 
Offer step-by-step reasoning to justify the correct answer, breaking down complex concepts into clear and understandable parts. Ensure that the explanation is structured, easy to follow, and adapted to the user's level of expertise. The goal is to not only provide the correct answer but to also enhance the user's understanding of the subject matter, foster critical thinking skills, and build confidence in tackling similar questions in the future.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 16: To tackle multiple-choice questions effectively, I will employ a systematic approach that involves thoroughly analyzing the question, carefully evaluating each answer choice, and providing detailed, step-by-step reasoning to justify the correct answer. This process will not only help in selecting the right option but also in understanding the underlying concepts and principles. The goal is to break down complex questions into manageable parts, assess each component critically, and synthesize the information to arrive at a well-reasoned conclusion. By doing so, the aim is to enhance comprehension, foster critical thinking, and build confidence in addressing multiple-choice assessments across various disciplines.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 17: Act as an expert tutor guiding students through complex multiple-choice questions, providing detailed step-by-step reasoning for each answer choice. Your goal is to not only help students select the correct answer but also to enhance their understanding of the subject matter by breaking down intricate concepts into clear, manageable parts. As you analyze each question, consider the various disciplines and fields of study it may touch upon, such as economics, law, physics, or general knowledge, and tailor your explanations accordingly to foster a deeper and more interdisciplinary understanding. Ensure your reasoning is structured, easy to follow, and adapted to the student's level of expertise, promoting critical thinking, analytical reasoning, and confidence in tackling a wide range of multiple-choice assessments.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 18: You are a critical member of an elite task force responsible for analyzing complex multiple-choice questions and providing accurate, well-reasoned explanations to support your answers. Your team has been entrusted with a high-stakes mission to tackle a series of challenging assessments, and your performance will directly impact the success of the operation. Each question you encounter requires meticulous analysis, careful consideration of the answer choices, and the ability to articulate clear, step-by-step reasoning to justify your conclusions. Your goal is to demonstrate exceptional critical thinking skills, adapt to diverse subjects and question formats, and consistently deliver accurate, confidence-inspiring answers. The fate of the mission rests on your ability to think critically, solve problems effectively, and communicate complex ideas with clarity and precision. 
You must analyze the question, evaluate the options, and provide a well-reasoned explanation for your answer choice, ensuring that your response is concise, informative, and free of ambiguity.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 19: To address the task effectively, I propose the following instruction: \n", + "\n", + "\"Analyze the given question and options carefully, considering the context and any specific details provided. Break down complex concepts into manageable parts to assess each option's validity. Provide a step-by-step reasoning process to arrive at the correct answer, ensuring that the explanation is clear, structured, and adaptable to the user's level of expertise. The goal is to not only select the correct answer but also to enhance the user's understanding and critical thinking skills through a well-reasoned and detailed explanation.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 20: Act as an expert tutor guiding students through complex multiple-choice questions across various subjects, including economics, law, physics, and general knowledge. Your task is to analyze the question, evaluate the provided options, and generate a step-by-step reasoning process that leads to the correct answer. Ensure your explanations are clear, structured, and adaptable to different levels of expertise, aiming to enhance critical thinking, analytical reasoning, and decision-making skills. When approaching a question, consider the context, identify key concepts, and apply relevant principles or theories to deduce the most appropriate answer. Your goal is to not only provide the correct answer but also to educate and foster a deeper understanding of the subject matter, enabling students to tackle similar questions with confidence and accuracy.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 21: You are a critical component of an emergency response system designed to provide accurate and timely guidance on complex, high-stakes multiple-choice questions. In this scenario, a team of experts is racing against time to solve a series of critical problems, and their decisions will have far-reaching consequences. Your role is to analyze each question, assess the answer choices, and provide well-reasoned explanations to support the correct answer. The experts are relying on your guidance to make informed decisions, and the outcome of their efforts depends on the accuracy and clarity of your responses. You must break down complex concepts into clear, structured explanations, adapting your approach to the experts' level of expertise. Your goal is to empower the team to make confident, informed decisions, even in the most pressure-filled situations. You have the ability to ask for clarification or additional information if needed, but you must provide a final answer and explanation within a tight deadline. The fate of the mission rests on your ability to provide accurate and reliable guidance, so you must be meticulous, thorough, and clear in your analysis and explanations.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 22: To address the task effectively, I will analyze the provided question and options, breaking down complex concepts into clear, step-by-step reasoning. My goal is to select the correct answer and provide a detailed explanation that enhances understanding and fosters critical thinking skills. 
I will adapt my response to the user's level of expertise, ensuring the information is presented in a structured and accessible manner. For each question, I will evaluate the options based on relevant knowledge and logical deductions, clearly outlining my thought process to facilitate learning and confidence in tackling multiple-choice assessments.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 23: Analyze the given multiple-choice question and options, then provide a step-by-step reasoning process to determine the correct answer. Ensure the explanation is clear, structured, and adapted to the user's level of expertise, focusing on fostering deeper comprehension and confidence in tackling multiple-choice assessments.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: 24: You are a legal expert and educator with extensive experience in evidence law and trial procedures. Your role is to guide students and professionals through complex multiple-choice questions related to legal evidence, ensuring they understand the underlying principles and can apply them to real-world scenarios. Given a question and a set of options, analyze the scenario carefully, considering the relevance, reliability, and potential prejudice of the evidence in question. Then, provide a step-by-step reasoning process that leads to the correct answer, explaining the legal concepts and rules of evidence that apply. Your explanations should be clear, concise, and tailored to the user's level of expertise, aiming to enhance their critical thinking and decision-making skills in the context of legal evidence and trial procedures.\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Evaluating the default program...\n", + "\n", + "2025/01/29 23:15:54 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute the area of the triangle whose altitudes have lengths 10, 12, and 15.', 'options': {'A': '56\\\\sqrt{15}', 'B': '240\\\\sqrt{7}', 'C': '60\\\\sqrt{7}', 'D': '240\\\\sqrt{7}/7'}, 'answer': 'D', 'reasoning': \"## Step 1: Recall the relationship between the area of a triangle and its altitudes\\nThe area \\\\(A\\\\) of a triangle can be found using the formula \\\\(A = \\\\frac{1}{2}bh\\\\), where \\\\(b\\\\) is the base of the triangle and \\\\(h\\\\) is the height (or altitude) of the triangle. However, when all three altitudes are known, we can use the formula for the area involving the altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and the sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) of the triangle, given by \\\\(A = \\\\frac{1}{2}h_1a = \\\\frac{1}{2}h_2b = \\\\frac{1}{2}h_3c\\\\), but a more direct approach involves using Heron's formula in conjunction with the relationship between the altitudes and the sides.\\n\\n## Step 2: Use the formula that relates the area of a triangle directly to its altitudes\\nThere's a formula that relates the area \\\\(A\\\\) of a triangle directly to its altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), which is \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) for certain types of triangles, but this isn't universally applicable. 
A more universally applicable method involves understanding that the area can also be expressed in terms of its inradius \\\\(r\\\\) and semiperimeter \\\\(s\\\\), \\\\(A = rs\\\\), and the relationship between the altitudes and the sides can give us \\\\(r\\\\) through \\\\(r = \\\\frac{A}{s}\\\\) and the formula \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), but we need a direct link between altitudes and area.\\n\\n## Step 3: Correct Approach - Use the relationship between the altitudes and the area through the inradius\\nFor any triangle with altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and corresponding sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\), the area \\\\(A\\\\) can also be expressed as \\\\(A = \\\\frac{1}{2} \\\\cdot a \\\\cdot h_1 = \\\\frac{1}{2} \\\\cdot b \\\\cdot h_2 = \\\\frac{1}{2} \\\\cdot c \\\\cdot h_3\\\\). However, to directly compute the area from the altitudes without knowing the sides, we consider the formula \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) is not standard; instead, we should focus on the relationship \\\\(A = rs\\\\) where \\\\(r\\\\) is the inradius and \\\\(s\\\\) the semiperimeter. The inradius can be found from the altitudes and the sides through the formula for the area, but without the sides, we look for a relationship that directly ties altitudes to area.\\n\\n## Step 4: Apply the correct formula for the area using altitudes\\nGiven the altitudes \\\\(h_1 = 10\\\\), \\\\(h_2 = 12\\\\), and \\\\(h_3 = 15\\\\), we need to apply a correct and relevant formula that ties these directly to the area. However, the direct calculation of the area from altitudes without the sides or angles involves understanding that \\\\(h_1 = \\\\frac{2A}{a}\\\\), \\\\(h_2 = \\\\frac{2A}{b}\\\\), and \\\\(h_3 = \\\\frac{2A}{c}\\\\), which implies \\\\(A = \\\\frac{1}{2}ah_1 = \\\\frac{1}{2}bh_2 = \\\\frac{1}{2}ch_3\\\\). To find \\\\(A\\\\) directly from \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), consider the relationship of the altitudes to the area and the sides, but note that a direct formula might not be straightforward without additional relationships.\\n\\n## Step 5: Realize the necessity of a different approach\\nGiven the complexity and the potential for confusion in directly applying a formula without clear relevance, reconsider the basic principles of geometry and how they apply to triangles, specifically how altitudes relate to the area and the sides. The area of a triangle given its altitudes directly might not be straightforward without considering the properties of right triangles or the use of Heron's formula in a creative way.\\n\\n## Step 6: Reflect on Heron's Formula and its applicability\\nHeron's formula states that the area \\\\(A\\\\) of a triangle whose sides have lengths \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) is \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), where \\\\(s\\\\) is the semiperimeter of the triangle, \\\\(s = \\\\frac{a + b + c}{2}\\\\). However, without the sides, directly applying Heron's formula is not possible.\\n\\n## Step 7: Consider the relationship between altitudes, area, and the inradius\\nThe area \\\\(A\\\\) of a triangle is also equal to \\\\(rs\\\\), where \\\\(r\\\\) is the inradius and \\\\(s\\\\) is the semiperimeter. 
The inradius can be related to the altitudes and the area, but this requires knowledge of the sides or a relationship that ties the altitudes directly to the inradius and semiperimeter.\\n\\n## Step 8: Final Calculation Approach\\nGiven the altitudes 10, 12, and 15, and recognizing the limitations of the previous steps in directly calculating the area without additional information or a clear, direct formula, we should reconsider the geometric properties that relate altitudes to the area and sides of a triangle. However, a common approach to solve this problem involves using the formula for the area based on the altitudes and the sides they intersect, but this typically requires knowledge of the triangle's sides or angles.\\n\\nThe best answer is D.\"}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 271.00 / 299 (90.6%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:00<00:00, 2574.39it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:15:54 INFO dspy.evaluate.evaluate: Average Metric: 271.0 / 300 (90.3%)\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 90.33\n", + "\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: ==> STEP 3: FINDING OPTIMAL PROMPT PARAMETERS <==\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: We will evaluate the program over a series of trials with different combinations of instructions and few-shot examples to find the optimal combination using Bayesian Optimization.\n", + "\n", + "/home/justinai/.conda/envs/prompt-migration/lib/python3.10/site-packages/optuna/_experimental.py:31: ExperimentalWarning: Argument ``multivariate`` is an experimental feature. 
The interface can change in the future.\n", + " warnings.warn(\n", + "2025/01/29 23:15:54 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 1 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.88it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:16:07 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", + "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 12'].\n", + "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0]\n", + "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:16:07 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 2 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 24.00 / 25 (96.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:16<00:00, 1.49it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:16:24 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n", + "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n", + "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0]\n", + "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:16:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 3 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 25.00 / 25 (100.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.90it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:16:38 INFO dspy.evaluate.evaluate: Average Metric: 25 / 25 (100.0%)\n", + "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 100.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n", + "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0]\n", + "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:16:38 INFO 
dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:16:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 4 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.91it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:16:51 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 8'].\n", + "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0]\n", + "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:16:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 5 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:19<00:00, 1.29it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:17:10 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 13'].\n", + "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0]\n", + "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:17:10 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 6 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:17<00:00, 1.46it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:17:27 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", + "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 4'].\n", + "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0]\n", + "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:17:27 INFO 
dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:17:27 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 7 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.69it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:17:42 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", + "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n", + "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0]\n", + "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:17:42 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 8 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.74it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:17:57 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", + "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n", + "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0]\n", + "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:17:57 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 9 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:42<00:00, 1.68s/it]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:18:39 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", + "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n", + "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0]\n", + 
"2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: ============================\n", + "\n", + "\n", + "2025/01/29 23:18:39 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 10 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.72it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:18:54 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 10'].\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0]\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33]\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 1 =====\n", + "2025/01/29 23:18:54 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 96.0) from minibatch trials...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 205.00 / 230 (89.1%): 77%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊ | 230/300 [01:07<00:15, 4.47it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:20:01 ERROR dspy.utils.parallelizer: Error processing item Example({'question': 'Compute the area of the triangle whose altitudes have lengths 10, 12, and 15.', 'options': {'A': '56\\\\sqrt{15}', 'B': '240\\\\sqrt{7}', 'C': '60\\\\sqrt{7}', 'D': '240\\\\sqrt{7}/7'}, 'answer': 'D', 'reasoning': \"## Step 1: Recall the relationship between the area of a triangle and its altitudes\\nThe area \\\\(A\\\\) of a triangle can be found using the formula \\\\(A = \\\\frac{1}{2}bh\\\\), where \\\\(b\\\\) is the base of the triangle and \\\\(h\\\\) is the height (or altitude) of the triangle. 
However, when all three altitudes are known, we can use the formula for the area involving the altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and the sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) of the triangle, given by \\\\(A = \\\\frac{1}{2}h_1a = \\\\frac{1}{2}h_2b = \\\\frac{1}{2}h_3c\\\\), but a more direct approach involves using Heron's formula in conjunction with the relationship between the altitudes and the sides.\\n\\n## Step 2: Use the formula that relates the area of a triangle directly to its altitudes\\nThere's a formula that relates the area \\\\(A\\\\) of a triangle directly to its altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), which is \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) for certain types of triangles, but this isn't universally applicable. A more universally applicable method involves understanding that the area can also be expressed in terms of its inradius \\\\(r\\\\) and semiperimeter \\\\(s\\\\), \\\\(A = rs\\\\), and the relationship between the altitudes and the sides can give us \\\\(r\\\\) through \\\\(r = \\\\frac{A}{s}\\\\) and the formula \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), but we need a direct link between altitudes and area.\\n\\n## Step 3: Correct Approach - Use the relationship between the altitudes and the area through the inradius\\nFor any triangle with altitudes \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), and corresponding sides \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\), the area \\\\(A\\\\) can also be expressed as \\\\(A = \\\\frac{1}{2} \\\\cdot a \\\\cdot h_1 = \\\\frac{1}{2} \\\\cdot b \\\\cdot h_2 = \\\\frac{1}{2} \\\\cdot c \\\\cdot h_3\\\\). However, to directly compute the area from the altitudes without knowing the sides, we consider the formula \\\\(A = \\\\sqrt{\\\\frac{h_1h_2h_3}{2}(h_1 + h_2 + h_3)}\\\\) is not standard; instead, we should focus on the relationship \\\\(A = rs\\\\) where \\\\(r\\\\) is the inradius and \\\\(s\\\\) the semiperimeter. The inradius can be found from the altitudes and the sides through the formula for the area, but without the sides, we look for a relationship that directly ties altitudes to area.\\n\\n## Step 4: Apply the correct formula for the area using altitudes\\nGiven the altitudes \\\\(h_1 = 10\\\\), \\\\(h_2 = 12\\\\), and \\\\(h_3 = 15\\\\), we need to apply a correct and relevant formula that ties these directly to the area. However, the direct calculation of the area from altitudes without the sides or angles involves understanding that \\\\(h_1 = \\\\frac{2A}{a}\\\\), \\\\(h_2 = \\\\frac{2A}{b}\\\\), and \\\\(h_3 = \\\\frac{2A}{c}\\\\), which implies \\\\(A = \\\\frac{1}{2}ah_1 = \\\\frac{1}{2}bh_2 = \\\\frac{1}{2}ch_3\\\\). To find \\\\(A\\\\) directly from \\\\(h_1\\\\), \\\\(h_2\\\\), and \\\\(h_3\\\\), consider the relationship of the altitudes to the area and the sides, but note that a direct formula might not be straightforward without additional relationships.\\n\\n## Step 5: Realize the necessity of a different approach\\nGiven the complexity and the potential for confusion in directly applying a formula without clear relevance, reconsider the basic principles of geometry and how they apply to triangles, specifically how altitudes relate to the area and the sides. 
The area of a triangle given its altitudes directly might not be straightforward without considering the properties of right triangles or the use of Heron's formula in a creative way.\\n\\n## Step 6: Reflect on Heron's Formula and its applicability\\nHeron's formula states that the area \\\\(A\\\\) of a triangle whose sides have lengths \\\\(a\\\\), \\\\(b\\\\), and \\\\(c\\\\) is \\\\(A = \\\\sqrt{s(s-a)(s-b)(s-c)}\\\\), where \\\\(s\\\\) is the semiperimeter of the triangle, \\\\(s = \\\\frac{a + b + c}{2}\\\\). However, without the sides, directly applying Heron's formula is not possible.\\n\\n## Step 7: Consider the relationship between altitudes, area, and the inradius\\nThe area \\\\(A\\\\) of a triangle is also equal to \\\\(rs\\\\), where \\\\(r\\\\) is the inradius and \\\\(s\\\\) is the semiperimeter. The inradius can be related to the altitudes and the area, but this requires knowledge of the sides or a relationship that ties the altitudes directly to the inradius and semiperimeter.\\n\\n## Step 8: Final Calculation Approach\\nGiven the altitudes 10, 12, and 15, and recognizing the limitations of the previous steps in directly calculating the area without additional information or a clear, direct formula, we should reconsider the geometric properties that relate altitudes to the area and sides of a triangle. However, a common approach to solve this problem involves using the formula for the area based on the altitudes and the sides they intersect, but this typically requires knowledge of the triangle's sides or angles.\\n\\nThe best answer is D.\"}) (input_keys={'options', 'question'}): Expected dict_keys(['reasoning', 'answer']) but got dict_keys(['reasoning']). Set `provide_traceback=True` to see the stack trace.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 264.00 / 299 (88.3%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:55<00:00, 2.59it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:20:50 INFO dspy.evaluate.evaluate: Average Metric: 264.0 / 300 (88.0%)\n", + "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", + "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2025/01/29 23:20:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 11 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 24.00 / 25 (96.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00, 1.94it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:21:03 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23'].\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 
84.0, 84.0, 92.0, 88.0, 96.0]\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 12 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 24.00 / 25 (96.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2624.00it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:21:03 INFO dspy.evaluate.evaluate: Average Metric: 24 / 25 (96.0%)\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 96.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0]\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:21:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 13 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 21.00 / 25 (84.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:15<00:00, 1.63it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:21:18 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", + "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 2'].\n", + "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0]\n", + "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:21:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 14 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 23.00 / 25 (92.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:12<00:00, 1.97it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:21:31 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", + "2025/01/29 23:21:31 INFO 
dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 14'].\n", + "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0]\n", + "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:21:31 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 15 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:13<00:00, 1.86it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:21:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 9'].\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0]\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 16 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2597.48it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:21:44 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 1'].\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0]\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:21:44 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 17 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 21.00 / 25 (84.0%): 
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:41<00:00, 1.67s/it]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:22:26 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", + "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 11'].\n", + "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0]\n", + "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:22:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 18 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:14<00:00, 1.72it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:22:41 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 5'].\n", + "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0]\n", + "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:22:41 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 19 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 20.00 / 25 (80.0%): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:18<00:00, 1.36it/s] " + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:22:59 INFO dspy.evaluate.evaluate: Average Metric: 20 / 25 (80.0%)\n", + "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 80.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 17'].\n", + "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0]\n", + "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:22:59 INFO 
dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:22:59 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 20 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:42<00:00, 1.72s/it]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:23:42 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 22'].\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0]\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0]\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 2 =====\n", + "2025/01/29 23:23:42 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 96.0) from minibatch trials...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 262.00 / 300 (87.3%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:29<00:00, 3.36it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:25:12 INFO dspy.evaluate.evaluate: Average Metric: 262 / 300 (87.3%)\n", + "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", + "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", + "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2025/01/29 23:25:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 21 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 21.00 / 25 (84.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:40<00:00, 1.61s/it]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:25:52 INFO dspy.evaluate.evaluate: Average Metric: 21 / 25 (84.0%)\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 84.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 20'].\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0]\n", + "2025/01/29 23:25:52 INFO 
dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 22 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2398.33it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:25:52 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 23'].\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0]\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:25:52 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 23 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 23.00 / 25 (92.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:39<00:00, 1.56s/it]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:26:32 INFO dspy.evaluate.evaluate: Average Metric: 23 / 25 (92.0%)\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 92.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 24'].\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0]\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 24 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 2904.08it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:26:32 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 
23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 0'].\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0, 88.0]\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:26:32 INFO dspy.teleprompt.mipro_optimizer_v2: == Minibatch Trial 25 / 25 ==\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 22.00 / 25 (88.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25/25 [00:43<00:00, 1.74s/it]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:27:15 INFO dspy.evaluate.evaluate: Average Metric: 22 / 25 (88.0%)\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 88.0 on minibatch of size 25 with parameters ['Predictor 0: Instruction 7'].\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [92.0, 96.0, 100.0, 88.0, 88.0, 92.0, 84.0, 84.0, 92.0, 88.0, 96.0, 96.0, 84.0, 92.0, 88.0, 88.0, 84.0, 88.0, 80.0, 88.0, 84.0, 88.0, 92.0, 88.0, 88.0]\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33]\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: =============================\n", + "\n", + "\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Full Eval 3 =====\n", + "2025/01/29 23:27:15 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 92.0) from minibatch trials...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Average Metric: 267.00 / 300 (89.0%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:30<00:00, 3.30it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:28:46 INFO dspy.evaluate.evaluate: Average Metric: 267 / 300 (89.0%)\n", + "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [90.33, 88.0, 87.33, 89.0]\n", + "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 90.33\n", + "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: =======================\n", + "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: \n", + "\n", + "2025/01/29 23:28:46 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 90.33!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "CPU times: user 18.3 s, sys: 3.02 s, total: 21.3 s\n", + "Wall time: 23min 6s\n" + ] + } + ], + "source": [ + "%%time\n", + "subset_size = 500\n", + "optimizer = 
dspy.MIPROv2(\n", + " metric=benchmark.metric,\n", + " auto=\"medium\",\n", + " num_threads=NUM_THREADS,\n", + " task_model=TASK_MODEL,\n", + " prompt_model=PROMPT_MODEL,\n", + " max_bootstrapped_demos=0,\n", + " max_labeled_demos=FEW_SHOTS,\n", + ")\n", + "\n", + "optimized_program = optimizer.compile(\n", + " program,\n", + " trainset=trainset[:subset_size],\n", + " valset=valset[:subset_size],\n", + " requires_permission_to_run=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST PROMPT:\n", + " You are a knowledgeable and insightful assistant designed to provide guidance on multiple-choice questions. Your role involves analyzing questions, assessing answer choices, and offering well-reasoned explanations to enhance understanding. By breaking down complex concepts, you help users develop critical thinking skills and improve their decision-making process. You strive to present information in a clear, structured manner while adapting to the user's level of expertise. Ultimately, your goal is to foster deeper comprehension and confidence in tackling multiple-choice assessments.\n" + ] + } + ], + "source": [ + "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST EXAMPLES:\n", + " []\n" + ] + } + ], + "source": [ + "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'ChainOfThought' object has no attribute 'predict'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[48], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m examples_json \u001b[38;5;241m=\u001b[39m [example\u001b[38;5;241m.\u001b[39mto_dict() \u001b[38;5;28;01mfor\u001b[39;00m example \u001b[38;5;129;01min\u001b[39;00m \u001b[43moptimized_program\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[38;5;241m.\u001b[39mdemos]\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBEST EXAMPLES:\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, json\u001b[38;5;241m.\u001b[39mdumps(examples_json, indent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m))\n", + "\u001b[0;31mAttributeError\u001b[0m: 'ChainOfThought' object has no attribute 'predict'" + ] + } + ], + "source": [ + "examples_json = [example.to_dict() for example in optimized_program.predict.demos]\n", + "print(\"BEST EXAMPLES:\\n\", json.dumps(examples_json, indent=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average Metric: 453.00 / 500 (90.6%): 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [01:42<00:00, 4.90it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025/01/29 23:32:56 INFO 
dspy.evaluate.evaluate: Average Metric: 453 / 500 (90.6%)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/html": [ + "
Evaluation results table: 500 rows × 7 columns (question, options, example_answer, example_reasoning, pred_reasoning, pred_answer, metric); full row-level details are in the text/plain rendering below.\n", + "
" + ], + "text/plain": [ + " question \\\n", + "0 For which of these two scenarios does the main character (who uses... \n", + "1 Two lists, list1 and list2, contain the names of books found in tw... \n", + "2 How might the recent global economic crisis be viewed as a challen... \n", + "3 What term is used to represent unavoidable past costs that cannot ... \n", + "4 Markson Co. traded a concrete-mixing truck with a book value of $1... \n", + ".. ... \n", + "495 A man is charged with murder. During the trial, defense counsel of... \n", + "496 Two men held-up a liquor store in a city. During the robbery, one ... \n", + "497 Which vitamins are important in lowering circulating homocysteine ... \n", + "498 This question refers to the following information. \"The greatest c... \n", + "499 Millions of immigrant children who entered the United States learn... \n", + "\n", + " options \\\n", + "0 {'A': 'Wrong, Wrong', 'B': 'Wrong, Not wrong', 'C': 'Not wrong, Wr... \n", + "1 {'A': 'newList ← Combine (listl, list2)\\n newList ← Sort (newList)... \n", + "2 {'A': 'Governmental cyberspace restrictions, in the form of censor... \n", + "3 {'A': 'Prime costs', 'B': 'Sunk costs', 'C': 'Opportunity costs', ... \n", + "4 {'A': 'Does the book value of the asset given up exceed the fair v... \n", + ".. ... \n", + "495 {'A': 'not hearsay.', 'B': 'hearsay, but admissible as an admissio... \n", + "496 {'A': 'granted, because the prosecutor is constitutionally require... \n", + "497 {'A': 'Vitamin D', 'B': 'Vitamin C', 'C': 'Vitamin A', 'D': 'Folat... \n", + "498 {'A': 'African nations will not achieve independence without unity... \n", + "499 {'A': 'Acculturation', 'B': 'Collective behavior', 'C': 'Social st... \n", + "\n", + " example_answer \\\n", + "0 A \n", + "1 A \n", + "2 C \n", + "3 B \n", + "4 C \n", + ".. ... \n", + "495 D \n", + "496 B \n", + "497 D \n", + "498 D \n", + "499 A \n", + "\n", + " example_reasoning \\\n", + "0 ## Step 1: Evaluate the first scenario In the first scenario, the ... \n", + "1 To create newList, which contains the names of all books found in ... \n", + "2 ## Step 1: Understanding the liberalist perspective The liberalist... \n", + "3 ## Step 1: Understanding the concept of sunk costs Sunk costs are ... \n", + "4 ## Step 1: Understand the concept of commercial substance in asset... \n", + ".. ... \n", + "495 To answer this question, let's break it down step by step: ## Step... \n", + "496 To answer this question, we need to consider the legal implication... \n", + "497 ## Step 1: Understanding the role of vitamins in homocysteine leve... \n", + "498 ## Step 1: Understand the context of Nkrumah's statement Nkrumah e... \n", + "499 ## Step 1: Understanding the concept of acculturation Acculturatio... \n", + "\n", + " pred_reasoning \\\n", + "0 To determine which scenario involves the main character doing some... \n", + "1 To create newList, which contains the names of all books found in ... \n", + "2 The liberalist perspective emphasizes the importance of free marke... \n", + "3 To answer this question, we need to understand the concept of each... \n", + "4 To determine whether an exchange of assets has commercial substanc... \n", + ".. ... \n", + "495 The testimony in question involves a statement made by a man on de... \n", + "496 The defendant's motion to dismiss the indictment due to the delay ... \n", + "497 To answer this question, we need to consider the role of vitamins ... \n", + "498 To answer this question, we need to understand the context and the... 
\n", + "499 The concept described in the question involves immigrant children ... \n", + "\n", + " pred_answer metric \n", + "0 A ✔️ [True] \n", + "1 A ✔️ [True] \n", + "2 C ✔️ [True] \n", + "3 B ✔️ [True] \n", + "4 C ✔️ [True] \n", + ".. ... ... \n", + "495 D ✔️ [True] \n", + "496 C \n", + "497 D ✔️ [True] \n", + "498 D ✔️ [True] \n", + "499 A ✔️ [True] \n", + "\n", + "[500 rows x 7 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 4.91 s, sys: 587 ms, total: 5.49 s\n", + "Wall time: 1min 42s\n" + ] + } + ], + "source": [ + "%%time\n", + "score, results, all_scores = evaluate(\n", + " optimized_program,\n", + " devset=testset[:500],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Heavy Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "MAX_BOOTSTRAPPED_DEMOS = 5\n", + "MAX_LABELED_DEMOS = 5\n", + "OPTIMIZER_MODE = \"heavy\"\n", + "optimizer = dspy.MIPROv2(\n", + " metric=benchmark.metric,\n", + " auto=OPTIMIZER_MODE,\n", + " num_threads=NUM_THREADS,\n", + " task_model=TASK_MODEL,\n", + " prompt_model=PROMPT_MODEL,\n", + " max_labeled_demos=MAX_LABELED_DEMOS,\n", + " max_bootstrapped_demos=MAX_BOOTSTRAPPED_DEMOS,\n", + ")\n", + "\n", + "optimized_program = optimizer.compile(\n", + " program,\n", + " trainset=trainset,\n", + " valset=valset,\n", + " program_aware_proposer=False,\n", + " requires_permission_to_run=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BEST PROMPT:\n", + " self = Predict(StringSignature(question, options -> reasoning, answer\n", + " instructions='You are a helpful assistant.'\n", + " question = Field(annotation=str required=True json_schema_extra={'desc': 'The question to be answered', '__dspy_field_type': 'input', 'prefix': 'Question:'})\n", + " options = Field(annotation=dict required=True json_schema_extra={'desc': 'Dictionary of answer choices', '__dspy_field_type': 'input', 'prefix': 'Options:'})\n", + " reasoning = Field(annotation=str required=True json_schema_extra={'prefix': \"Reasoning: Let's think step by step in order to\", 'desc': '${reasoning}', '__dspy_field_type': 'output'})\n", + " answer = Field(annotation=str required=True json_schema_extra={'desc': 'The correct answer letter', '__dspy_field_type': 'output', 'prefix': 'Answer:'})\n", + "))\n" + ] + } + ], + "source": [ + "print(\"BEST PROMPT:\\n\", optimized_program)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "score, results, all_scores = evaluate(\n", + " optimized_program,\n", + " devset=testset,\n", + " display_table=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/end-to-end-use-cases/prompt-migration/notebooks/llama_mmlu_pro.ipynb b/end-to-end-use-cases/prompt-migration/notebooks/llama_mmlu_pro.ipynb new file mode 100644 index 000000000..857c837d6 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/notebooks/llama_mmlu_pro.ipynb @@ -0,0 +1,728 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports & Env Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%reload_ext autoreload\n", + "%autoreload 2\n", + "import sys\n", + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "from datasets import load_dataset\n", + "\n", + "import dspy\n", + "sys.path.append(os.path.abspath('../'))\n", + "\n", + "from benchmarks import llama_mmlu_pro\n", + "from benchmarks.store_llama_mmlu_pro import store_results, store_optimization_results, store_evaluation_results\n", + "from benchmarks.statistical_eval import StatisticalEvaluate\n", + "\n", + "from config import MODEL_CONFIGS\n", + "\n", + "\n", + "import weave\n", + "weave.init(project_name=\"mmlu-pro-optimization\") # You can change the project name as needed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_THREADS = 36\n", + "benchmark = llama_mmlu_pro\n", + "STATISTICAL_EVAL = False\n", + "FEW_SHOT_BASELINE = False" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "TASK_MODEL_NAME = \"vllm_llama_8b\"\n", + "TASK_MODEL_SETTINGS = MODEL_CONFIGS.get(TASK_MODEL_NAME, MODEL_CONFIGS[TASK_MODEL_NAME])\n", + "\n", + "JUDGE_MODEL_NAME = \"openrouter_gpt4o\"\n", + "JUDGE_MODEL_SETTINGS = MODEL_CONFIGS.get(JUDGE_MODEL_NAME, MODEL_CONFIGS[JUDGE_MODEL_NAME])\n", + "\n", + "TASK_MODEL = dspy.LM(\n", + " TASK_MODEL_SETTINGS[\"model\"],\n", + " api_base=TASK_MODEL_SETTINGS[\"api_base\"],\n", + " api_key=TASK_MODEL_SETTINGS[\"api_key\"],\n", + " cache=False\n", + ")\n", + "\n", + "JUDGE_MODEL = dspy.LM(\n", + " JUDGE_MODEL_SETTINGS[\"model\"],\n", + " api_base=JUDGE_MODEL_SETTINGS[\"api_base\"],\n", + " api_key=JUDGE_MODEL_SETTINGS[\"api_key\"],\n", + " cache=False\n", + ")\n", + "\n", + "dspy.configure(lm=TASK_MODEL)\n", + "\n", + "\n", + "# Create a partial metric function that includes the judge_lm\n", + "from functools import partial\n", + "metric_with_judge = partial(benchmark.metric, judge_lm=JUDGE_MODEL)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Prompts and Evals" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'FEW_SHOT_BASELINE' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[43mFEW_SHOT_BASELINE\u001b[49m:\n\u001b[1;32m 2\u001b[0m program \u001b[38;5;241m=\u001b[39m 
dspy\u001b[38;5;241m.\u001b[39mChainOfThought(\n\u001b[1;32m 3\u001b[0m benchmark\u001b[38;5;241m.\u001b[39msignature(\n\u001b[1;32m 4\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"You are a helpful assistant. \"\"\"\u001b[39;00m\n\u001b[1;32m 5\u001b[0m ))\n\u001b[1;32m 6\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mNameError\u001b[0m: name 'FEW_SHOT_BASELINE' is not defined" + ] + } + ], + "source": [ + "if FEW_SHOT_BASELINE:\n", + " program = dspy.ChainOfThought(\n", + " benchmark.signature(\n", + " \"\"\"You are a helpful assistant. Few shots examples:\n", + " 1. \n", + " 2.\n", + " 3.\n", + " 4.\n", + " 5.\n", + " \"\"\"\n", + " ))\n", + "else:\n", + " program = dspy.ChainOfThought(\n", + " benchmark.signature(\n", + " \"\"\"You are a helpful assistant.\"\"\"\n", + "))\n", + "\n", + "\n", + "evaluate = dspy.Evaluate(\n", + " devset=[],\n", + " max_errors=500,\n", + " metric=metric_with_judge, # Use the partial function that includes JUDGE_MODEL\n", + " num_threads=NUM_THREADS,\n", + " display_progress=True,\n", + " display_table=True,\n", + " return_all_scores=True,\n", + " return_outputs=True,\n", + " provide_traceback=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load entire dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainset, valset, testset = benchmark.datasets(\n", + " train_size=0.25,\n", + " validation_size=0.25,\n", + ")\n", + "\n", + "len(trainset), len(valset), len(testset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "combined_dataset = trainset + valset + testset\n", + "len(combined_dataset)\n", + "TESTSET = combined_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Baseline Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "print(\"Starting execution...\")\n", + "\n", + "if STATISTICAL_EVAL:\n", + " evaluate = StatisticalEvaluate(\n", + " n_runs=5, # Number of evaluation runs for statistical significance\n", + " confidence_level=0.95, # 95% confidence interval\n", + " devset=[],\n", + " max_errors=500,\n", + " metric=metric_with_judge, # Use the partial function that includes JUDGE_MODEL\n", + " num_threads=NUM_THREADS,\n", + " display_progress=True,\n", + " display_table=True,\n", + " return_all_scores=True,\n", + " return_outputs=True,\n", + " )\n", + " \n", + " # Then modify the evaluation call to:\n", + " \n", + " stats_results = evaluate(\n", + " program,\n", + " devset=TESTSET,\n", + " statistical=True # Enable statistical evaluation\n", + " )\n", + " \n", + " print(f\"\\nStatistical Results:\")\n", + " print(f\"Mean Score: {stats_results.mean_score:.2f}\")\n", + " print(f\"Standard Deviation: {stats_results.std_dev:.2f}\")\n", + " print(f\"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})\")\n", + " print(f\"Number of Runs: {stats_results.n_runs}\")\n", + " print(f\"Sample Size: {stats_results.sample_size}\")\n", + "\n", + " run_number = store_results(\n", + " task_model=TASK_MODEL.model,\n", + " judge_model=JUDGE_MODEL.model,\n", + " program=program, \n", + " stats_results=stats_results, \n", + " )\n", + "\n", + "\n", + "\n", + "else:\n", + " # eval_subset_size = len(testset)\n", + " score, results, all_scores = evaluate(\n", 
+ " program,\n", + " devset=TESTSET,\n", + " )\n", + "\n", + " run_number = store_results(\n", + " task_model=TASK_MODEL.model,\n", + " judge_model=JUDGE_MODEL.model,\n", + " program=program, \n", + " results=results, \n", + " score=score,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Prompt Optimization + Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Configure Lite Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "FEW_SHOTS = 5\n", + "\n", + "PROMPT_MODEL_NAME = \"vllm_llama_70b\"\n", + "PROMPT_MODEL_SETTINGS = MODEL_CONFIGS.get(PROMPT_MODEL_NAME, MODEL_CONFIGS[PROMPT_MODEL_NAME])\n", + "\n", + "PROMPT_MODEL = dspy.LM(\n", + " PROMPT_MODEL_SETTINGS[\"model\"],\n", + " api_base=PROMPT_MODEL_SETTINGS[\"api_base\"],\n", + " api_key=PROMPT_MODEL_SETTINGS[\"api_key\"],\n", + " cache=False\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Lite Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "OPTIMIZER=\"light\"\n", + "\n", + "optimizer = dspy.MIPROv2(\n", + " metric=benchmark.metric,\n", + " auto=OPTIMIZER,\n", + " num_threads=NUM_THREADS,\n", + " task_model=TASK_MODEL,\n", + " prompt_model=PROMPT_MODEL,\n", + " max_labeled_demos=FEW_SHOTS,\n", + " verbose=True\n", + ")\n", + "\n", + "optimized_program = optimizer.compile(\n", + " program,\n", + " trainset=trainset,\n", + " valset=valset,\n", + " requires_permission_to_run=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_number = store_optimization_results(\n", + " task_model=TASK_MODEL.model,\n", + " judge_model=JUDGE_MODEL.model,\n", + " program=optimized_program,\n", + " optimization=OPTIMIZER\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Run Evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "print(\"Starting execution...\")\n", + "STATISTICAL_EVAL=True\n", + "if STATISTICAL_EVAL:\n", + " evaluate = StatisticalEvaluate(\n", + " n_runs=5, # Number of evaluation runs for statistical significance\n", + " confidence_level=0.95, # 95% confidence interval\n", + " devset=[],\n", + " max_errors=500,\n", + " metric=metric_with_judge, # Use the partial function that includes JUDGE_MODEL\n", + " num_threads=NUM_THREADS,\n", + " display_progress=True,\n", + " display_table=True,\n", + " return_all_scores=True,\n", + " return_outputs=True,\n", + " )\n", + " \n", + " # Run the evaluation\n", + " stats_results = evaluate(\n", + " optimized_program,\n", + " devset=TESTSET,\n", + " statistical=True # Enable statistical evaluation\n", + " )\n", + " \n", + " # Print statistical results\n", + " print(f\"\\nStatistical Results:\")\n", + " print(f\"Mean Score: {stats_results.mean_score:.2f}\")\n", + " print(f\"Standard 
Deviation: {stats_results.std_dev:.2f}\")\n", + " print(f\"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})\")\n", + " print(f\"Number of Runs: {stats_results.n_runs}\")\n", + " print(f\"Sample Size: {stats_results.sample_size}\")\n", + " \n", + " # Store evaluation results\n", + " store_evaluation_results(\n", + " run_number=run_number, # Use the run_number from optimization\n", + " task_model=TASK_MODEL.model,\n", + " stats_results=stats_results\n", + " )\n", + "\n", + "else:\n", + " # Run regular evaluation\n", + " score, results, all_scores = evaluate(\n", + " optimized_program,\n", + " devset=TESTSET,\n", + " )\n", + "\n", + " # Store evaluation results\n", + " store_evaluation_results(\n", + " run_number=run_number, # Use the run_number from optimization\n", + " task_model=TASK_MODEL.model,\n", + " results=results,\n", + " score=score\n", + " )\n", + "\n", + "print(f\"Completed run_{run_number}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Medium Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "subset_size = 500\n", + "optimizer = dspy.MIPROv2(\n", + " metric=benchmark.metric,\n", + " auto=\"medium\",\n", + " num_threads=NUM_THREADS,\n", + " task_model=TASK_MODEL,\n", + " prompt_model=PROMPT_MODEL,\n", + " max_labeled_demos=FEW_SHOTS,\n", + ")\n", + "\n", + "optimized_program = optimizer.compile(\n", + " program,\n", + " trainset=trainset[:subset_size],\n", + " valset=valset[:subset_size],\n", + " requires_permission_to_run=False,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"BEST PROMPT:\\n\", optimized_program.predict.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"BEST EXAMPLES:\\n\", optimized_program.predict.demos)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running Benchmark" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "print(\"Starting execution...\")\n", + "\n", + "if STATISTICAL_EVAL:\n", + " evaluate = StatisticalEvaluate(\n", + " n_runs=5, # Number of evaluation runs for statistical significance\n", + " confidence_level=0.95, # 95% confidence interval\n", + " devset=[],\n", + " max_errors=500,\n", + " metric=metric_with_judge, # Use the partial function that includes JUDGE_MODEL\n", + " num_threads=NUM_THREADS,\n", + " display_progress=True,\n", + " display_table=True,\n", + " return_all_scores=True,\n", + " return_outputs=True,\n", + " )\n", + " \n", + " # Then modify the evaluation call to:\n", + " \n", + " stats_results = evaluate(\n", + " program,\n", + " devset=TESTSET,\n", + " statistical=True # Enable statistical evaluation\n", + " )\n", + " \n", + " # Print statistical results\n", + " print(f\"\\nStatistical Results:\")\n", + " print(f\"Mean Score: {stats_results.mean_score:.2f}\")\n", + " print(f\"Standard Deviation: {stats_results.std_dev:.2f}\")\n", + " print(f\"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})\")\n", + " print(f\"Number of Runs: {stats_results.n_runs}\")\n", + " print(f\"Sample Size: {stats_results.sample_size}\")\n", + " \n", + " # If you want to 
compare two programs:\n", + " # stats_results_a, stats_results_b, p_value = evaluate.compare_programs(\n", + " # program_a=program,\n", + " # program_b=optimized_program,\n", + " # devset=testset\n", + " # )\n", + " \n", + " # print(\"\\nProgram Comparison:\")\n", + " # print(f\"Program A Mean Score: {stats_results_a.mean_score:.2f}\")\n", + " # print(f\"Program B Mean Score: {stats_results_b.mean_score:.2f}\")\n", + " # print(f\"P-value: {p_value:.4f}\")\n", + " # print(f\"Statistically Significant: {p_value < 0.05}\")\n", + "\n", + " run_number = store_results(\n", + " task_model=TASK_MODEL.model,\n", + " judge_model=JUDGE_MODEL.model,\n", + " program=optimized_program, \n", + " stats_results=stats_results, \n", + " )\n", + "\n", + "\n", + "\n", + "else:\n", + " # eval_subset_size = len(testset)\n", + " score, results, all_scores = evaluate(\n", + " program,\n", + " devset=TESTSET,\n", + " )\n", + "\n", + " run_number = store_results(\n", + " task_model=TASK_MODEL.model,\n", + " judge_model=JUDGE_MODEL.model,\n", + " program=optimized_program, \n", + " results=results, \n", + " score=score,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Heavy Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "NUM_THREADS = 48\n", + "OPTIMIZER = 'heavy'\n", + "\n", + "optimizer = dspy.MIPROv2(\n", + " metric=benchmark.metric,\n", + " auto=OPTIMIZER,\n", + " num_threads=NUM_THREADS,\n", + " task_model=TASK_MODEL,\n", + " prompt_model=PROMPT_MODEL,\n", + " max_labeled_demos=FEW_SHOTS,\n", + ")\n", + "\n", + "optimized_program = optimizer.compile(\n", + " program,\n", + " trainset=trainset,\n", + " valset=valset,\n", + " requires_permission_to_run=False,\n", + "\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "run_number = store_optimization_results(\n", + " task_model=TASK_MODEL.model,\n", + " judge_model=JUDGE_MODEL.model,\n", + " program=optimized_program,\n", + " optimization=OPTIMIZER\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"BEST PROMPT:\\n\", optimized_program.signature.instructions)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "print(\"BEST EXAMPLES:\\n\", optimized_program.demos)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "print(\"Starting execution...\")\n", + "\n", + "if STATISTICAL_EVAL:\n", + " evaluate = StatisticalEvaluate(\n", + " n_runs=5, # Number of evaluation runs for statistical significance\n", + " confidence_level=0.95, # 95% confidence interval\n", + " devset=[],\n", + " max_errors=500,\n", + " metric=metric_with_judge, # Use the partial function that includes JUDGE_MODEL\n", + " num_threads=NUM_THREADS,\n", + " display_progress=True,\n", + " display_table=True,\n", + " return_all_scores=True,\n", + " return_outputs=True,\n", + " )\n", + " \n", + " # Run the evaluation\n", + " stats_results = evaluate(\n", + " optimized_program,\n", + " devset=TESTSET,\n", + " statistical=True # Enable statistical evaluation\n", + " )\n", + " \n", + " # Print statistical results\n", + " print(f\"\\nStatistical Results:\")\n", + " print(f\"Mean Score: {stats_results.mean_score:.2f}\")\n", + " print(f\"Standard 
Deviation: {stats_results.std_dev:.2f}\")\n", + " print(f\"95% Confidence Interval: ({stats_results.confidence_interval[0]:.2f}, {stats_results.confidence_interval[1]:.2f})\")\n", + " print(f\"Number of Runs: {stats_results.n_runs}\")\n", + " print(f\"Sample Size: {stats_results.sample_size}\")\n", + " \n", + " # Store evaluation results\n", + " store_evaluation_results(\n", + " run_number=run_number, # Use the run_number from optimization\n", + " task_model=TASK_MODEL.model,\n", + " stats_results=stats_results\n", + " )\n", + "\n", + "else:\n", + " # Run regular evaluation\n", + " score, results, all_scores = evaluate(\n", + " optimized_program,\n", + " devset=TESTSET,\n", + " )\n", + "\n", + " # Store evaluation results\n", + " store_evaluation_results(\n", + " run_number=run_number, # Use the run_number from optimization\n", + " task_model=TASK_MODEL.model,\n", + " results=results,\n", + " score=score\n", + " )\n", + "\n", + "print(f\"Completed run_{run_number}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/end-to-end-use-cases/prompt-migration/notebooks/start_vllm.py b/end-to-end-use-cases/prompt-migration/notebooks/start_vllm.py new file mode 100644 index 000000000..4442a2c86 --- /dev/null +++ b/end-to-end-use-cases/prompt-migration/notebooks/start_vllm.py @@ -0,0 +1,63 @@ +import os +import subprocess +import sys + +from config import MODEL_CONFIGS # Import model configurations + + +def start_vllm(cuda_devices, model_name): + """Start vLLM server for the selected model with user-defined CUDA settings.""" + if model_name not in MODEL_CONFIGS: + print(f"Error: Model '{model_name}' not found in config.") + print("Available models:", ", ".join(MODEL_CONFIGS.keys())) + sys.exit(1) + + MODEL_SETTINGS = MODEL_CONFIGS[model_name] + + model_path = MODEL_SETTINGS["model"].replace("hosted_vllm/", "") + api_base = MODEL_SETTINGS["api_base"] + port = MODEL_SETTINGS["port"] + tensor_parallel = MODEL_SETTINGS["tensor_parallel"] + gpu_util = MODEL_SETTINGS["gpu_util"] + chat_template = MODEL_SETTINGS.get("chat_template", None) + + # Use provided CUDA devices or default from config + os.environ["CUDA_VISIBLE_DEVICES"] = ( + cuda_devices if cuda_devices != "default" else MODEL_SETTINGS["cuda_devices"] + ) + print(f"Using CUDA devices: {os.environ['CUDA_VISIBLE_DEVICES']}") + + # Build vLLM serve command + vllm_command = ( + f"vllm serve {model_path} " + f"--port {port} " + f"--tensor-parallel-size {tensor_parallel} " + f"--gpu-memory-utilization {gpu_util} " + ) + + # Add chat template flag if required + if chat_template: + vllm_command += f"--chat-template {chat_template} " + + print(f"Starting vLLM server for model: {model_name}") + print(f"Running command: {vllm_command}") + + # Run the command in a new process + subprocess.run(vllm_command, shell=True) + + +if __name__ == "__main__": + if len(sys.argv) < 3: + print( + "Usage: CUDA_VISIBLE_DEVICES= python start_vllm.py " + ) + print( + "Example: CUDA_VISIBLE_DEVICES=0,1 python start_vllm.py 0,1 vllm_llama_405b" + ) + print("Available models:", ", ".join(MODEL_CONFIGS.keys())) + sys.exit(1) + + cuda_devices = sys.argv[1] + model_name = 
sys.argv[2] + + start_vllm(cuda_devices, model_name)
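
Note on config.py: both notebooks and start_vllm.py read their model settings from MODEL_CONFIGS, which is not part of this diff. The sketch below only illustrates the shape those callers assume, inferred from the keys they access (model, api_base, api_key, port, tensor_parallel, gpu_util, cuda_devices, and the optional chat_template); every concrete name, URL, port, and value is an illustrative placeholder, not the project's real configuration.

```python
import os

# Hypothetical MODEL_CONFIGS entries (placeholders only), matching the keys read by
# start_vllm.py and the notebooks; the real config.py is not shown in this diff.
MODEL_CONFIGS = {
    "vllm_llama_8b": {
        "model": "hosted_vllm/meta-llama/Llama-3.1-8B-Instruct",  # "hosted_vllm/" prefix is stripped before `vllm serve`
        "api_base": "http://localhost:8001/v1",
        "api_key": "EMPTY",     # a locally hosted vLLM server does not validate the key
        "port": 8001,           # --port
        "tensor_parallel": 1,   # --tensor-parallel-size
        "gpu_util": 0.90,       # --gpu-memory-utilization
        "cuda_devices": "0",    # default CUDA_VISIBLE_DEVICES when "default" is passed
        # "chat_template": "templates/llama3.jinja",  # optional --chat-template
    },
    "openrouter_gpt4o": {
        "model": "openrouter/openai/gpt-4o",
        "api_base": "https://openrouter.ai/api/v1",
        "api_key": os.environ.get("OPENROUTER_API_KEY", ""),
    },
}
```

With an entry like the first one, `python start_vllm.py default vllm_llama_8b` would assemble roughly `vllm serve meta-llama/Llama-3.1-8B-Instruct --port 8001 --tensor-parallel-size 1 --gpu-memory-utilization 0.9`, and the notebooks would point `dspy.LM` at the matching `api_base`.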
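
The notebooks wrap `benchmark.metric` with `functools.partial` so that `dspy.Evaluate`, which calls the metric as `metric(example, prediction, trace)`, can still receive a pre-bound judge model. The metric itself lives in benchmarks/llama_mmlu_pro.py and is not shown in this diff; the sketch below only illustrates the calling convention that the `partial()` pattern assumes, with a made-up exact-match body.

```python
from functools import partial
from types import SimpleNamespace

# Assumed shape of the benchmark metric (illustrative, not the real implementation):
# judge_lm must be bindable by keyword while (example, prediction, trace) stay positional.
def metric(example, prediction, trace=None, judge_lm=None):
    # Toy exact-match on the answer letter; a real metric might fall back to
    # judge_lm when the prediction is not a clean option letter.
    return str(prediction.answer).strip().upper() == str(example.answer).strip().upper()

# Binding the judge once preserves the (example, prediction, trace) signature
# that dspy.Evaluate expects; the notebooks bind judge_lm=JUDGE_MODEL here.
metric_with_judge = partial(metric, judge_lm=None)

example = SimpleNamespace(answer="C")
prediction = SimpleNamespace(answer="c")
print(metric_with_judge(example, prediction))  # True
```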
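
StatisticalEvaluate (benchmarks/statistical_eval.py, also outside this diff) is configured with n_runs=5 and confidence_level=0.95, and the notebooks print a mean score, standard deviation, and 95% confidence interval from its result. The sketch below shows how such summary statistics are typically computed from per-run scores, assuming a t-interval over the run means and that SciPy is available; the real class may compute them differently.

```python
import statistics
from scipy import stats

def summarize_runs(run_scores, confidence_level=0.95):
    """Mean, sample std dev, and a t-based confidence interval over repeated eval runs."""
    n = len(run_scores)
    mean = statistics.mean(run_scores)
    std_dev = statistics.stdev(run_scores)   # sample standard deviation
    sem = std_dev / n ** 0.5                 # standard error of the mean
    # t-interval (df = n - 1) accounts for the small number of runs, e.g. n_runs=5
    low, high = stats.t.interval(confidence_level, n - 1, loc=mean, scale=sem)
    return mean, std_dev, (low, high)

# Made-up per-run scores purely for illustration (five runs, as in the notebooks).
mean, std_dev, ci = summarize_runs([71.2, 70.4, 72.0, 71.6, 70.9])
print(f"Mean Score: {mean:.2f}")
print(f"Standard Deviation: {std_dev:.2f}")
print(f"95% Confidence Interval: ({ci[0]:.2f}, {ci[1]:.2f})")
```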