|
4 | 4 | import inspect |
5 | 5 | import os |
6 | 6 | import sys |
| 7 | +import time |
7 | 8 | import traceback |
8 | 9 | import warnings |
9 | 10 | from pathlib import Path |
|
53 | 54 | from ._utils import MISSING, MISSING_TYPE, html_escape, wrap_async |
54 | 55 |
|
55 | 56 | if TYPE_CHECKING: |
| 57 | + from inspect_ai.model import ChatMessage as InspectChatMessage |
| 58 | + from inspect_ai.solver import TaskState as InspectTaskState |
| 59 | + |
56 | 60 | from ._content import ToolAnnotations |
57 | 61 |
|
58 | 62 |
|
@@ -807,6 +811,189 @@ def console( |
807 | 811 | self.chat(user_input, echo=echo, stream=stream, kwargs=kwargs) |
808 | 812 | print("") |
809 | 813 |
|
| 814 | + def to_solver( |
| 815 | + self, |
| 816 | + *, |
| 817 | + include_system_prompt: bool = False, |
| 818 | + include_turns: bool = False, |
| 819 | + ): |
| 820 | + """ |
| 821 | + Create an InspectAI solver from this chat. |
| 822 | +
|
| 823 | + Translates this Chat instance into an InspectAI solver function that can |
| 824 | + be used with InspectAI's evaluation framework. This solver will capture |
| 825 | + (and translate) important state from the chat, including the model, |
| 826 | + system prompt, previous turns, registered tools, model parameters, etc. |
| 827 | +
|
| 828 | + Parameters |
| 829 | + ---------- |
| 830 | + include_system_prompt |
| 831 | + Whether to include the system prompt in the solver's starting |
| 832 | + messages. |
| 833 | + include_turns |
| 834 | + Whether to include the chat's existing turns in the solver's |
| 835 | + starting messages. |
| 836 | +
|
| 837 | + Note |
| 838 | + ---- |
| 839 | + Both `include_system_prompt` and `include_turns` default to `False` since |
| 840 | + `.export_eval()` already captures this information; including it here |
| 841 | + as well would duplicate context in the evaluation. However, you may |
| 842 | + want to include them in some cases: for example, if you are manually |
| 843 | + constructing an evaluation dataset that does not include this |
| 844 | + information, or if you want the same starting context regardless of |
| 845 | + the evaluation dataset. |
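| | + |
| | + For example, a sketch of opting in to both (appropriate only when |
| | + your dataset does not already carry this context): |
| | + |
| | + ```python |
| | + solver = chat.to_solver( |
| | +     include_system_prompt=True, |
| | +     include_turns=True, |
| | + ) |
| | + ``` |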
| 846 | +
|
| 847 | + Returns |
| 848 | + ------- |
| 849 | + An [InspectAI solver](https://inspect.aisi.org.uk/solvers.html) |
| 850 | + function that can be used with InspectAI's evaluation framework. |
| 851 | +
|
| 852 | + Examples |
| 853 | + -------- |
| 854 | + First, put this code in a Python script, perhaps named `eval_chat.py`: |
| 855 | +
|
| 856 | + ```{.python filename="eval_chat.py"} |
| 857 | + from chatlas import ChatOpenAI |
| 858 | + from inspect_ai import Task, task |
| 859 | + from inspect_ai.dataset import csv_dataset |
| 860 | + from inspect_ai.scorer import model_graded_qa |
| 861 | +
|
| 862 | + chat = ChatOpenAI(system_prompt="You are a helpful assistant.") |
| 863 | +
|
| 864 | + @task |
| 865 | + def my_eval(grader_model: str = "openai/gpt-4o"): |
| 866 | + return Task( |
| 867 | + dataset=csv_dataset("my_eval_dataset.csv"), |
| 868 | + solver=chat.to_solver(), |
| 869 | + scorer=model_graded_qa(model=grader_model) |
| 870 | + ) |
| 871 | + ``` |
| 872 | +
|
| 873 | + Then run the evaluation with InspectAI's CLI: |
| 874 | +
|
| 875 | + ```bash |
| 876 | + inspect eval eval_chat.py -T grader_model=openai/gpt-4o |
| 877 | + ``` |
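| | + |
| | + Alternatively, a sketch of running the same task from Python, assuming |
| | + Inspect's top-level `eval()` entry point: |
| | + |
| | + ```python |
| | + from inspect_ai import eval |
| | + |
| | + # Depending on your setup, you may also need to pass model=... |
| | + # (or set INSPECT_EVAL_MODEL); the solver itself generates via the chat |
| | + eval(my_eval(grader_model="openai/gpt-4o")) |
| | + ``` |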
| 878 | +
|
| 879 | + Note |
| 880 | + ---- |
| 881 | + Learn more about this method and InspectAI's evaluation framework |
| 882 | + in the [Chatlas documentation](https://posit-dev.github.io/chatlas/misc/evals.html). |
| 883 | + """ |
| 884 | + |
| 885 | + from ._inspect import ( |
| 886 | + inspect_content_as_chatlas, |
| 887 | + inspect_messages_as_turns, |
| 888 | + try_import_inspect, |
| 889 | + turn_as_inspect_messages, |
| 890 | + ) |
| 891 | + |
| 892 | + (imodel, isolver, _) = try_import_inspect() |
| 893 | + |
| 894 | + # Create a copy of the chat to avoid modifying its state |
| 895 | + # when inspect runs the solver |
| 896 | + chat_instance = copy.deepcopy(self) |
| 897 | + model = self.provider.model |
| 898 | + |
| 899 | + # Remove existing turns if requested |
| 900 | + if not include_turns: |
| 901 | + chat_instance.set_turns([]) |
| 902 | + |
| 903 | + # Prepare the starting messages from the chat instance |
| 904 | + starting_turns = chat_instance.get_turns( |
| 905 | + include_system_prompt=include_system_prompt |
| 906 | + ) |
| 907 | + |
| 908 | + # Translate starting turns to Inspect messages |
| 909 | + starting_messages: list["InspectChatMessage"] = [] |
| 910 | + for turn in starting_turns: |
| 911 | + starting_messages.extend(turn.to_inspect_messages(model)) |
| 912 | + |
| 913 | + # Since Inspect preserves state across solves, prepend the starting messages only once |
| 914 | + has_starting_messages = False |
| 915 | + |
| 916 | + @isolver.solver(f"chatlas_{self.provider.name}_{model}") |
| 917 | + def _solver(): |
| 918 | + async def solve(state: "InspectTaskState", generate): |
| 919 | + nonlocal has_starting_messages |
| 920 | + start_time = time.perf_counter() |
| 921 | + |
| 922 | + if not has_starting_messages: |
| 923 | + state.messages = starting_messages + state.messages |
| 924 | + has_starting_messages = True |
| 925 | + |
| 926 | + # Now that we've translated the starting messages to Inspect, |
| 927 | + # we translate the message state back to the chat instance. |
| 928 | + # N.B. state.messages can include non-trivial sample input from the |
| 929 | + # dataset (e.g., `Sample(input=[ChatMessage, ...])`) |
| 930 | + system_messages: list["InspectChatMessage"] = [] |
| 931 | + other_messages: list["InspectChatMessage"] = [] |
| 932 | + user_prompt: "InspectChatMessage | None" = None |
| 933 | + for x in reversed(state.messages): |
| 934 | + if x.role == "user" and user_prompt is None: |
| 935 | + user_prompt = x |
| 936 | + elif x.role == "system": |
| 937 | + system_messages.append(x) |
| 938 | + else: |
| 939 | + other_messages.append(x) |
| 940 | + |
| 941 | + other_messages.reverse() |
| 942 | + |
| 943 | + # Set the system prompt on the chat instance |
| 944 | + if len(system_messages) == 1: |
| 945 | + chat_instance.system_prompt = system_messages[0].text |
| 946 | + elif len(system_messages) > 1: |
| 947 | + raise ValueError( |
| 948 | + "Multiple system prompts detected in `.to_solver()`, but chatlas only " |
| 949 | + "supports a single system prompt. This usually indicates that the system " |
| 950 | + "prompt is mistakenly included in both the eval dataset (via `.export_eval()`) " |
| 951 | + "and on the chat instance. Consider dropping the system prompt from " |
| 952 | + "the chat instance by setting `.to_solver(include_system_prompt=False)`." |
| 953 | + ) |
| 954 | + |
| 955 | + # Now, set the other messages as turns on the chat instance |
| 956 | + chat_instance.set_turns(inspect_messages_as_turns(other_messages)) |
| 957 | + |
| 958 | + if user_prompt is None: |
| 959 | + raise ValueError("No user prompt found in InspectAI state messages") |
| 960 | + |
| 961 | + # A sample's input may be a plain string rather than a list of content |
| | + if isinstance(user_prompt.content, str): |
| | + input_content = [user_prompt.content] |
| | + else: |
| | + input_content = [inspect_content_as_chatlas(x) for x in user_prompt.content] |
| 962 | + |
| 963 | + await chat_instance.chat_async(*input_content, echo="none") |
| 964 | + last_turn = chat_instance.get_last_turn(role="assistant") |
| 965 | + if last_turn is None: |
| 966 | + raise ValueError("No assistant turn found after chat completion") |
| 967 | + |
| 968 | + last_turn_message = turn_as_inspect_messages( |
| 969 | + last_turn, "assistant", model |
| 970 | + )[0] |
| 971 | + state.messages.append(last_turn_message) |
| 972 | + |
| 973 | + tokens = last_turn.tokens |
| 974 | + if tokens is None: |
| 975 | + usage = None |
| 976 | + else: |
| 977 | + usage = imodel.ModelUsage( |
| 978 | + input_tokens=tokens[0], |
| 979 | + output_tokens=tokens[1], |
| 980 | + total_tokens=tokens[0] + tokens[1], |
| 981 | + input_tokens_cache_read=tokens[2], |
| 982 | + ) |
| 983 | + |
| 984 | + state.output = imodel.ModelOutput( |
| 985 | + model=model, |
| 986 | + choices=[imodel.ChatCompletionChoice(message=last_turn_message)], |
| 987 | + completion=last_turn.text, |
| 988 | + usage=usage, |
| 989 | + time=time.perf_counter() - start_time, |
| 990 | + ) |
| 991 | + return state |
| 992 | + |
| 993 | + return solve |
| 994 | + |
| 995 | + return _solver() |
| 996 | + |
810 | 997 | def chat( |
811 | 998 | self, |
812 | 999 | *args: Content | str, |
@@ -1882,7 +2069,11 @@ def get_current_weather(latitude: float, longitude: float): |
1882 | 2069 | "Get the current weather given a latitude and longitude." |
1883 | 2070 |
|
1884 | 2071 | lat_lng = f"latitude={latitude}&longitude={longitude}" |
1885 | | - url = f"https://api.open-meteo.com/v1/forecast?{lat_lng}&current=temperature_2m,wind_speed_10m&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m" |
| 2072 | + url = ( |
| 2073 | + "https://api.open-meteo.com/v1/forecast?" |
| 2074 | + f"{lat_lng}¤t=temperature_2m,wind_speed_10m" |
| 2075 | + "&hourly=temperature_2m,relative_humidity_2m,wind_speed_10m" |
| 2076 | + ) |
1886 | 2077 | response = requests.get(url) |
1887 | 2078 | json = response.json() |
1888 | 2079 | if chat.current_display: |
@@ -2030,6 +2221,142 @@ def _html_template(contents: str) -> str: |
2030 | 2221 | </html> |
2031 | 2222 | """ |
2032 | 2223 |
|
| 2224 | + def export_eval( |
| 2225 | + self, |
| 2226 | + filename: str | Path, |
| 2227 | + *, |
| 2228 | + target: Optional[str] = None, |
| 2229 | + include_system_prompt: bool = True, |
| 2230 | + turns: Optional[list[Turn]] = None, |
| 2231 | + overwrite: Literal["append", True, False] = "append", |
| 2232 | + **kwargs: Any, |
| 2233 | + ): |
| 2234 | + """ |
| 2235 | + Creates an Inspect AI eval dataset sample from the current chat. |
| 2236 | +
|
| 2237 | + Creates an Inspect AI eval |
| 2238 | + [Sample](https://inspect.aisi.org.uk/reference/inspect_ai.dataset.html#sample) |
| 2239 | + from the current chat and appends it to a JSONL file. In Inspect, an eval |
| 2240 | + dataset is a collection of Samples, where each Sample represents a |
| 2241 | + single `input` (i.e., user prompt) and the expected `target` (i.e., the |
| 2242 | + target answer and/or grading guidance for it). Note that each `input` of |
| 2243 | + a particular sample can contain a series of messages (from both the user |
| 2244 | + and assistant). |
| 2245 | +
|
| 2246 | + Note |
| 2247 | + ---- |
| 2248 | + Each call to this method appends a single Sample as a new line in the |
| 2249 | + specified JSONL file. If the file does not exist, it will be created. |
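| | + |
| | + For instance, a sketch of building up a small dataset by exporting |
| | + several one-shot chats to the same file: |
| | + |
| | + ```python |
| | + from chatlas import ChatOpenAI |
| | + |
| | + for prompt in ["What is 2 + 2?", "Name a prime number."]: |
| | +     chat = ChatOpenAI(system_prompt="You are a helpful assistant.") |
| | +     chat.chat(prompt) |
| | +     chat.export_eval("my_eval.jsonl") |
| | + ``` |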
| 2250 | +
|
| 2251 | + Parameters |
| 2252 | + ---------- |
| 2253 | + filename |
| 2254 | + The filename to export the chat to. Currently this must |
| 2255 | + be a `.jsonl` file. |
| 2256 | + target |
| 2257 | + The target output for the eval sample. By default, this is |
| 2258 | + taken to be the content of the last assistant turn. |
| 2259 | + include_system_prompt |
| 2260 | + Whether to include the system prompt (if any) as the |
| 2261 | + first turn in the eval sample. |
| 2262 | + turns |
| 2263 | + The input turns for the eval sample. By default, this is |
| 2264 | + taken to be all turns except the last (assistant) turn. |
| 2265 | + Note that system prompts are not allowed here; they are |
| 2266 | + controlled separately via the `include_system_prompt` parameter. |
| 2267 | + overwrite |
| 2268 | + Behavior when the file already exists: |
| 2269 | + - `"append"` (default): Append to the existing file. |
| 2270 | + - `True`: Overwrite the existing file. |
| 2271 | + - `False`: Raise an error if the file already exists. |
| 2272 | + kwargs |
| 2273 | + Additional keyword arguments to pass to the `Sample()` constructor. |
| 2274 | + This is primarily useful for setting an ID or metadata on the sample. |
| 2275 | +
|
| 2276 | + Examples |
| 2277 | + -------- |
| 2278 | +
|
| 2279 | + Step 1: export the chat to an eval JSONL file |
| 2280 | +
|
| 2281 | + ```python |
| 2282 | + from chatlas import ChatOpenAI |
| 2283 | +
|
| 2284 | + chat = ChatOpenAI(system_prompt="You are a helpful assistant.") |
| 2285 | + chat.chat("Hello, how are you?") |
| 2286 | +
|
| 2287 | + chat.export_eval("my_eval.jsonl") |
| 2288 | + ``` |
| 2289 | +
|
| 2290 | + Step 2: load the eval JSONL file into an Inspect AI eval task |
| 2291 | +
|
| 2292 | + ```python |
| 2293 | + from chatlas import ChatOpenAI |
| 2294 | + from inspect_ai import Task, task |
| 2295 | + from inspect_ai.dataset import json_dataset |
| 2296 | + from inspect_ai.scorer import model_graded_qa |
| 2297 | +
|
| 2298 | + # No need to set a system prompt -- it's included in the eval JSONL file by default |
| 2299 | + chat = ChatOpenAI() |
| 2300 | +
|
| 2301 | +
|
| 2302 | + @task |
| 2303 | + def my_eval(): |
| 2304 | + return Task( |
| 2305 | + dataset=json_dataset("my_eval.jsonl"), |
| 2306 | + solver=chat.to_solver(), |
| 2307 | + scorer=model_graded_qa(model="openai/gpt-4o-mini"), |
| 2308 | + ) |
| 2309 | + ``` |
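| | + |
| | + To attach an ID or metadata to the exported sample, pass them through |
| | + `kwargs` (forwarded to the `Sample()` constructor; the values here are |
| | + illustrative): |
| | + |
| | + ```python |
| | + chat.export_eval( |
| | +     "my_eval.jsonl", |
| | +     id="greeting-1", |
| | +     metadata={"topic": "greetings"}, |
| | + ) |
| | + ``` |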
| 2310 | + """ |
| 2311 | + |
| 2312 | + if isinstance(filename, str): |
| 2313 | + filename = Path(filename) |
| 2314 | + |
| 2315 | + filename = filename.resolve() |
| 2316 | + if filename.exists() and overwrite is False: |
| 2317 | + raise ValueError( |
| 2318 | + f"File {filename} already exists. Set `overwrite=True` to overwrite or `overwrite='append'` to append." |
| 2319 | + ) |
| 2320 | + |
| 2321 | + if filename.suffix not in {".jsonl"}: |
| 2322 | + raise ValueError("The filename must have a `.jsonl` extension.") |
| 2323 | + |
| 2324 | + if turns is None: |
| 2325 | + turns = self.get_turns(include_system_prompt=False) |
| 2326 | + |
| 2327 | + if any(x.role == "system" for x in turns): |
| 2328 | + raise ValueError("System prompts are not allowed in eval input turns.") |
| 2329 | + |
| 2330 | + if not any(x.role == "user" for x in turns): |
| 2331 | + raise ValueError("At least one user turn is required in eval input turns.") |
| 2332 | + |
| 2333 | + if include_system_prompt: |
| 2334 | + system_turn = self.get_last_turn(role="system") |
| 2335 | + if system_turn: |
| 2336 | + turns = [system_turn] + turns |
| 2337 | + |
| 2338 | + input_turns, target_turn = turns[:-1], turns[-1] |
| 2339 | + if target_turn.role != "assistant": |
| 2340 | + raise ValueError("The last turn must be an assistant turn.") |
| 2341 | + |
| 2342 | + if target is None: |
| 2343 | + target = str(target_turn) |
| 2344 | + |
| 2345 | + input_messages = [] |
| 2346 | + for x in input_turns: |
| 2347 | + input_messages.extend(x.to_inspect_messages(self.provider.model)) |
| 2348 | + |
| 2349 | + from inspect_ai.dataset import Sample |
| 2350 | + |
| 2351 | + sample = Sample(input=input_messages, target=target, **kwargs) |
| 2352 | + sample_json = sample.model_dump_json(exclude_none=True) |
| 2353 | + |
| 2354 | + mode = "a" if overwrite == "append" and filename.exists() else "w" |
| 2355 | + with open(filename, mode) as f: |
| 2356 | + f.write(sample_json + "\n") |
| 2357 | + |
| 2358 | + return filename |
| 2359 | + |
2033 | 2360 | @overload |
2034 | 2361 | def _chat_impl( |
2035 | 2362 | self, |
|