
Commit bc9764d

karangattu, cpsievert, and Copilot authored
Add support for evals via Inspect AI (#184)
* Add ability for a chat instance to be used as a solver via `chat.to_solver()`
* Remove example file since the PR description has that code
* Bump min version of Python to 3.10
* Run type checks
* Update chatlas/_chat.py (Co-authored-by: Carson Sievert <[email protected]>)
* Preserve chat history when creating InspectAI solver: updates `Chat.to_solver` to initialize the InspectAI state with existing chat history and system prompt if present. Adds a test to verify that chat history and system prompt are preserved when creating a solver.
* Enhance `Chat.to_solver` with rich content and tool call support: expanded `Chat.to_solver` to translate rich content (text, images) and handle tool calls for compatibility with InspectAI's message format. Added helper functions for content translation and tool call extraction. Updated tests to cover scenarios with mixed content and tool calls in chat history.
* Refactor chat content translation and clean up tests: simplifies content translation logic in the Chat class, streamlining handling of different content types and tool calls.
* Take a pass at various improvements to translating chatlas<->inspect
* Refactor InspectAI integration into a separate module: moved InspectAI translation helpers from `_chat.py` to a new `_inspect.py` module.
* Apply suggestions from code review (Co-authored-by: Carson Sievert <[email protected]>)
* Move `_content` imports to top level in `_inspect.py`
* Refactor imports in inspect helpers and tests: reorganized and cleaned up imports in `chatlas/_inspect.py` and `tests/test_inspect.py`.
* Replace `anthropic.Omit` with `anthropic.NotGiven` in type hints
* Add integration tests for inspect_ai evaluation: added integration tests to verify ChatOpenAI evaluation with inspect_ai, including geography, simple QA, and tool usage scenarios. Also updated example usage in the Chat docstring for clarity.
* Add evals documentation and expand Inspect AI tests
* Lower minimum Python version to 3.9: updated pyproject.toml to require Python >=3.9 instead of >=3.10. Adjusted the 'eval' extra to only include 'inspect-ai' for Python >=3.10 for compatibility.
* Add evals page to documentation sidebar: included misc/evals.qmd in the docs sidebar navigation to make the evals page accessible from the documentation menu.
* Add importorskip for inspect_ai in test files: added `pytest.importorskip('inspect_ai')` to integration and unit test files to conditionally skip tests if the inspect_ai module is not available. This prevents test failures in environments where inspect_ai is not installed.
* Reformat import statements for readability
* Apply suggestions from code review (Co-authored-by: Carson Sievert <[email protected]>)
* Update types
* First pass on evals article
* Update docstring
* Fix import/type issues
* Add choices to output state
* Introduce `.export_eval()` for more delightful eval collection
* Improve/simplify tests; avoid user prompt duplication
* More testing improvements
* Update pricing info
* Apply suggestions from code review (Co-authored-by: Copilot <[email protected]>)
* Update changelog
* Update types

---------

Co-authored-by: Carson Sievert <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent e91a6a2 commit bc9764d

File tree

12 files changed (+3745, -632 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ Untitled*.ipynb
 uv.lock
 
 sandbox/
+logs/
 
 /.luarc.json

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
@@ -9,6 +9,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [UNRELEASED]
 
+### New features
+
+* Added support for systematic evaluation via [Inspect AI](https://inspect.aisi.org.uk/). This includes:
+  * A new `.export_eval()` method for exporting conversation history as an Inspect eval dataset sample. This supports multi-turn conversations, tool calls, images, PDFs, and structured data.
+  * A new `.to_solver()` method for translating chat instances into Inspect solvers that can be used with Inspect's evaluation framework.
+  * A new `Turn.to_inspect_messages()` method for converting turns to Inspect's message format.
+  * Comprehensive documentation in the [Evals guide](https://posit-dev.github.io/chatlas/misc/evals.html).
+
 
 
 ## [0.13.2] - 2025-10-02
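Taken together, the changelog items above compose into one workflow: collect samples with `.export_eval()`, then evaluate with `.to_solver()`. A minimal sketch (not part of the diff; the filename, question, and grader model below are placeholders, and it assumes chatlas with this commit plus `inspect-ai` installed on Python >= 3.10):

```python
from chatlas import ChatOpenAI
from inspect_ai import Task, task
from inspect_ai.dataset import json_dataset
from inspect_ai.scorer import model_graded_qa

chat = ChatOpenAI(system_prompt="You are a helpful assistant.")
chat.chat("What is the capital of France?")

# Appends one Sample (input turns + target) as a line in the JSONL dataset.
chat.export_eval("capitals.jsonl")


@task
def capitals_eval():
    return Task(
        dataset=json_dataset("capitals.jsonl"),
        solver=chat.to_solver(),
        scorer=model_graded_qa(model="openai/gpt-4o-mini"),
    )
```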

chatlas/_chat.py

Lines changed: 328 additions & 1 deletion
@@ -4,6 +4,7 @@
 import inspect
 import os
 import sys
+import time
 import traceback
 import warnings
 from pathlib import Path
@@ -53,6 +54,9 @@
 from ._utils import MISSING, MISSING_TYPE, html_escape, wrap_async
 
 if TYPE_CHECKING:
+    from inspect_ai.model import ChatMessage as InspectChatMessage
+    from inspect_ai.solver import TaskState as InspectTaskState
+
     from ._content import ToolAnnotations
 
 
@@ -807,6 +811,189 @@ def console(
         self.chat(user_input, echo=echo, stream=stream, kwargs=kwargs)
         print("")
 
+    def to_solver(
+        self,
+        *,
+        include_system_prompt: bool = False,
+        include_turns: bool = False,
+    ):
+        """
+        Create an InspectAI solver from this chat.
+
+        Translates this Chat instance into an InspectAI solver function that can
+        be used with InspectAI's evaluation framework. This solver will capture
+        (and translate) important state from the chat, including the model,
+        system prompt, previous turns, registered tools, model parameters, etc.
+
+        Parameters
+        ----------
+        include_system_prompt
+            Whether to include the system prompt in the solver's starting
+            messages.
+        include_turns
+            Whether to include the chat's existing turns in the solver's
+            starting messages.
+
+        Note
+        ----
+        Both `include_system_prompt` and `include_turns` default to `False`
+        since `.export_eval()` captures this information already, so including
+        it here would duplicate context in the evaluation. However, in some
+        cases you may want to include them, for example if you are manually
+        constructing an evaluation dataset that does not include this
+        information, or if you want to always have the same starting context
+        regardless of the evaluation dataset.
+
+        Returns
+        -------
+        An [InspectAI solver](https://inspect.ai-safety-institute.org.uk/solvers.html)
+        function that can be used with InspectAI's evaluation framework.
+
+        Examples
+        --------
+        First, put this code in a Python script, perhaps named `eval_chat.py`:
+
+        ```{.python filename="eval_chat.py"}
+        from chatlas import ChatOpenAI
+        from inspect_ai import Task, task
+        from inspect_ai.dataset import csv_dataset
+        from inspect_ai.scorer import model_graded_qa
+
+        chat = ChatOpenAI(system_prompt="You are a helpful assistant.")
+
+        @task
+        def my_eval(grader_model: str = "openai/gpt-4o"):
+            return Task(
+                dataset=csv_dataset("my_eval_dataset.csv"),
+                solver=chat.to_solver(),
+                scorer=model_graded_qa(model=grader_model)
+            )
+        ```
+
+        Then run the evaluation with InspectAI's CLI:
+
+        ```bash
+        inspect eval eval_chat.py -T grader_model=openai/gpt-4o
+        ```
+
+        Note
+        ----
+        Learn more about this method and InspectAI's evaluation framework
+        in the [Chatlas documentation](https://posit-dev.github.io/chatlas/misc/evals.html).
+        """
+
+        from ._inspect import (
+            inspect_content_as_chatlas,
+            inspect_messages_as_turns,
+            try_import_inspect,
+            turn_as_inspect_messages,
+        )
+
+        (imodel, isolver, _) = try_import_inspect()
+
+        # Create a copy of the chat to avoid modifying its state
+        # when inspect runs the solver
+        chat_instance = copy.deepcopy(self)
+        model = self.provider.model
+
+        # Remove existing turns if requested
+        if not include_turns:
+            chat_instance.set_turns([])
+
+        # Prepare the starting messages from the chat instance
+        starting_turns = chat_instance.get_turns(
+            include_system_prompt=include_system_prompt
+        )
+
+        # Translate starting turns to Inspect messages
+        starting_messages: list["InspectChatMessage"] = []
+        for turn in starting_turns:
+            starting_messages.extend(turn.to_inspect_messages(model))
+
+        # Since Inspect preserves state across solves, prepend starting messages only once
+        has_starting_messages = False
+
+        @isolver.solver(f"chatlas_{self.provider.name}_{model}")
+        def _solver():
+            async def solve(state: InspectTaskState, generate):
+                nonlocal has_starting_messages
+                start_time = time.perf_counter()
+
+                if not has_starting_messages:
+                    state.messages = starting_messages + state.messages
+                    has_starting_messages = True
+
+                # Now that we've translated the starting messages to Inspect,
+                # we translate the message state back to the chat instance.
+                # N.B. state.messages can include non-trivial sample input from
+                # the dataset (e.g., `Sample(input=[ChatMessage, ...])`)
+                system_messages: list["InspectChatMessage"] = []
+                other_messages: list["InspectChatMessage"] = []
+                user_prompt: "InspectChatMessage | None" = None
+                for x in reversed(state.messages):
+                    if x.role == "user" and user_prompt is None:
+                        user_prompt = x
+                    elif x.role == "system":
+                        system_messages.append(x)
+                    else:
+                        other_messages.append(x)
+
+                other_messages.reverse()
+
+                # Set the system prompt on the chat instance
+                if len(system_messages) == 1:
+                    chat_instance.system_prompt = str(system_messages[0])
+                elif len(system_messages) > 1:
+                    raise ValueError(
+                        "Multiple system prompts detected in `.to_solver()`, but chatlas only "
+                        "supports a single system prompt. This usually indicates that the system "
+                        "prompt is mistakenly included in both the eval dataset (via `.export_eval()`) "
+                        "and on the chat instance. Consider dropping the system prompt from "
+                        "the chat instance by setting `.to_solver(include_system_prompt=False)`."
+                    )
+
+                # Now, set the other messages as turns on the chat instance
+                chat_instance.set_turns(inspect_messages_as_turns(other_messages))
+
+                if user_prompt is None:
+                    raise ValueError("No user prompt found in InspectAI state messages")
+
+                input_content = [inspect_content_as_chatlas(x) for x in user_prompt.content]
+
+                await chat_instance.chat_async(*input_content, echo="none")
+                last_turn = chat_instance.get_last_turn(role="assistant")
+                if last_turn is None:
+                    raise ValueError("No assistant turn found after chat completion")
+
+                last_turn_message = turn_as_inspect_messages(
+                    last_turn, "assistant", model
+                )[0]
+                state.messages.append(last_turn_message)
+
+                tokens = last_turn.tokens
+                if tokens is None:
+                    usage = None
+                else:
+                    usage = imodel.ModelUsage(
+                        input_tokens=tokens[0],
+                        output_tokens=tokens[1],
+                        total_tokens=tokens[0] + tokens[1],
+                        input_tokens_cache_read=tokens[2],
+                    )
+
+                state.output = imodel.ModelOutput(
+                    model=model,
+                    choices=[imodel.ChatCompletionChoice(message=last_turn_message)],
+                    completion=last_turn.text,
+                    usage=usage,
+                    time=time.perf_counter() - start_time,
+                )
+                return state
+
+            return solve
+
+        return _solver()
+
     def chat(
         self,
         *args: Content | str,
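As a usage note for the `include_system_prompt` / `include_turns` options documented in the hunk above: when a dataset is hand-written (rather than produced by `.export_eval()`), the system prompt and any seed turns are not baked into the samples, so you may want the solver to supply them. A hedged sketch, not part of the diff; the samples and model names below are invented for illustration:

```python
from chatlas import ChatOpenAI
from inspect_ai import Task, task
from inspect_ai.dataset import Sample
from inspect_ai.scorer import model_graded_qa

chat = ChatOpenAI(system_prompt="You are a terse geography tutor.")

# Hand-written samples: no system prompt or prior turns baked into the inputs.
samples = [
    Sample(input="What is the capital of France?", target="Paris"),
    Sample(input="What is the capital of Japan?", target="Tokyo"),
]


@task
def geography_eval():
    return Task(
        dataset=samples,
        # The dataset carries no system prompt, so let the solver add the chat's.
        solver=chat.to_solver(include_system_prompt=True),
        scorer=model_graded_qa(model="openai/gpt-4o-mini"),
    )
```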
@@ -1882,7 +2069,11 @@ def get_current_weather(latitude: float, longitude: float):
     "Get the current weather given a latitude and longitude."
 
     lat_lng = f"latitude={latitude}&longitude={longitude}"
-    url = f"https://api.open-meteo.com/v1/forecast?{lat_lng}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"
+    url = (
+        "https://api.open-meteo.com/v1/forecast?"
+        f"{lat_lng}&current=temperature_2m,wind_speed_10m"
+        "&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m"
+    )
     response = requests.get(url)
     json = response.json()
 if chat.current_display:
@@ -2030,6 +2221,142 @@ def _html_template(contents: str) -> str:
 </html>
 """
 
+    def export_eval(
+        self,
+        filename: str | Path,
+        *,
+        target: Optional[str] = None,
+        include_system_prompt: bool = True,
+        turns: Optional[list[Turn]] = None,
+        overwrite: Literal["append", True, False] = "append",
+        **kwargs: Any,
+    ):
+        """
+        Creates an Inspect AI eval dataset sample from the current chat.
+
+        Creates an Inspect AI eval
+        [Sample](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.html#sample)
+        from the current chat and appends it to a JSONL file. In Inspect, an eval
+        dataset is a collection of Samples, where each Sample represents a
+        single `input` (i.e., user prompt) and the expected `target` (i.e., the
+        target answer and/or grading guidance for it). Note that the `input` of
+        a particular sample can contain a series of messages (from both the user
+        and assistant).
+
+        Note
+        ----
+        Each call to this method appends a single Sample as a new line in the
+        specified JSONL file. If the file does not exist, it will be created.
+
+        Parameters
+        ----------
+        filename
+            The filename to export the chat to. Currently this must
+            be a `.jsonl` file.
+        target
+            The target output for the eval sample. By default, this is
+            taken to be the content of the last assistant turn.
+        include_system_prompt
+            Whether to include the system prompt (if any) as the
+            first turn in the eval sample.
+        turns
+            The input turns for the eval sample. By default, this is
+            taken to be all turns except the last (assistant) turn.
+            Note that system prompts are not allowed here, but controlled
+            separately via the `include_system_prompt` parameter.
+        overwrite
+            Behavior when the file already exists:
+            - `"append"` (default): Append to the existing file.
+            - `True`: Overwrite the existing file.
+            - `False`: Raise an error if the file already exists.
+        kwargs
+            Additional keyword arguments to pass to the `Sample()` constructor.
+            This is primarily useful for setting an ID or metadata on the sample.
+
+        Examples
+        --------
+
+        Step 1: export the chat to an eval JSONL file
+
+        ```python
+        from chatlas import ChatOpenAI
+
+        chat = ChatOpenAI(system_prompt="You are a helpful assistant.")
+        chat.chat("Hello, how are you?")
+
+        chat.export_eval("my_eval.jsonl")
+        ```
+
+        Step 2: load the eval JSONL file into an Inspect AI eval task
+
+        ```python
+        from chatlas import ChatOpenAI
+        from inspect_ai import Task, task
+        from inspect_ai.dataset import json_dataset
+        from inspect_ai.scorer import model_graded_qa
+
+        # No need to load in the system prompt -- it's included in the eval JSONL file by default
+        chat = ChatOpenAI()
+
+
+        @task
+        def my_eval():
+            return Task(
+                dataset=json_dataset("my_eval.jsonl"),
+                solver=chat.to_solver(),
+                scorer=model_graded_qa(model="openai/gpt-4o-mini"),
+            )
+        ```
+        """
+
+        if isinstance(filename, str):
+            filename = Path(filename)
+
+        filename = filename.resolve()
+        if filename.exists() and overwrite is False:
+            raise ValueError(
+                f"File {filename} already exists. Set `overwrite=True` to overwrite or `overwrite='append'` to append."
+            )
+
+        if filename.suffix not in {".jsonl"}:
+            raise ValueError("The filename must have a `.jsonl` extension.")
+
+        if turns is None:
+            turns = self.get_turns(include_system_prompt=False)
+
+        if any(x.role == "system" for x in turns):
+            raise ValueError("System prompts are not allowed in eval input turns.")
+
+        if not any(x.role == "user" for x in turns):
+            raise ValueError("At least one user turn is required in eval input turns.")
+
+        if include_system_prompt:
+            system_turn = self.get_last_turn(role="system")
+            if system_turn:
+                turns = [system_turn] + turns
+
+        input_turns, target_turn = turns[:-1], turns[-1]
+        if target_turn.role != "assistant":
+            raise ValueError("The last turn must be an assistant turn.")
+
+        if target is None:
+            target = str(target_turn)
+
+        input_messages = []
+        for x in input_turns:
+            input_messages.extend(x.to_inspect_messages(self.provider.model))
+
+        from inspect_ai.dataset import Sample
+
+        sample = Sample(input=input_messages, target=target, **kwargs)
+        sample_json = sample.model_dump_json(exclude_none=True)
+
+        mode = "a" if overwrite == "append" and filename.exists() else "w"
+        with open(filename, mode) as f:
+            f.write(sample_json + "\n")
+
+        return filename
+
     @overload
     def _chat_impl(
         self,