diff --git a/python/.gitignore b/python/.gitignore index 6e128526e14c..d7d27c394883 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -157,7 +157,7 @@ cython_debug/ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ +.idea/ .ruff_cache/ diff --git a/python/packages/autogen-core/docs/src/reference/index.md b/python/packages/autogen-core/docs/src/reference/index.md index ffa2581ba96d..29850dc3de61 100644 --- a/python/packages/autogen-core/docs/src/reference/index.md +++ b/python/packages/autogen-core/docs/src/reference/index.md @@ -67,4 +67,6 @@ python/autogen_ext.cache_store.diskcache python/autogen_ext.cache_store.redis python/autogen_ext.runtimes.grpc python/autogen_ext.auth.azure +python/autogen_ext.experimental.task_centric_memory +python/autogen_ext.experimental.task_centric_memory.utils ``` diff --git a/python/packages/autogen-core/docs/src/reference/python/autogen_ext.experimental.task_centric_memory.rst b/python/packages/autogen-core/docs/src/reference/python/autogen_ext.experimental.task_centric_memory.rst new file mode 100644 index 000000000000..e4bf80c70abb --- /dev/null +++ b/python/packages/autogen-core/docs/src/reference/python/autogen_ext.experimental.task_centric_memory.rst @@ -0,0 +1,8 @@ +autogen\_ext.experimental.task\_centric\_memory +=============================================== + + +.. 
automodule:: autogen_ext.experimental.task_centric_memory + :members: + :undoc-members: + :show-inheritance: diff --git a/python/packages/autogen-core/docs/src/reference/python/autogen_ext.experimental.task_centric_memory.utils.rst b/python/packages/autogen-core/docs/src/reference/python/autogen_ext.experimental.task_centric_memory.utils.rst new file mode 100644 index 000000000000..c5f7a58bfead --- /dev/null +++ b/python/packages/autogen-core/docs/src/reference/python/autogen_ext.experimental.task_centric_memory.utils.rst @@ -0,0 +1,8 @@ +autogen\_ext.experimental.task\_centric\_memory.utils +===================================================== + + +.. automodule:: autogen_ext.experimental.task_centric_memory.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/python/packages/autogen-ext/imgs/task_centric_memory.png b/python/packages/autogen-ext/imgs/task_centric_memory.png new file mode 100644 index 000000000000..763b4b3cfbf0 --- /dev/null +++ b/python/packages/autogen-ext/imgs/task_centric_memory.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d5d3cdaa77c863ecbeec41ce988c1018d49b2e914a9b3775f6574ea4bbbcee +size 37076 diff --git a/python/packages/autogen-ext/imgs/task_centric_memory_2.png b/python/packages/autogen-ext/imgs/task_centric_memory_2.png new file mode 100644 index 000000000000..1aed539683e2 --- /dev/null +++ b/python/packages/autogen-ext/imgs/task_centric_memory_2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119f7baf93e71fee417d1a9f9f994f6b3d4fbbc5aae930096a6897e755167e61 +size 28253 diff --git a/python/packages/autogen-ext/imgs/task_centric_memory_3.png b/python/packages/autogen-ext/imgs/task_centric_memory_3.png new file mode 100644 index 000000000000..7674512390b8 --- /dev/null +++ b/python/packages/autogen-ext/imgs/task_centric_memory_3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e2af80416085182ba099e5094014f37b7f88daf972dce704d862540566a52bb9 +size 30082 diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml index 30d33a354cbb..799ae48cfc85 100644 --- a/python/packages/autogen-ext/pyproject.toml +++ b/python/packages/autogen-ext/pyproject.toml @@ -68,6 +68,8 @@ jupyter-executor = [ "nbclient>=0.10.2", ] +task-centric-memory = ["chromadb>=0.6.3"] + semantic-kernel-core = [ "semantic-kernel>=1.17.1", ] @@ -142,7 +144,7 @@ dev = [ [tool.ruff] extend = "../../pyproject.toml" include = ["src/**", "tests/*.py"] -exclude = ["src/autogen_ext/agents/web_surfer/*.js", "src/autogen_ext/runtimes/grpc/protos", "tests/protos"] +exclude = ["src/autogen_ext/agents/web_surfer/*.js", "src/autogen_ext/runtimes/grpc/protos", "tests/protos", "README.md"] [tool.pyright] extends = "../../pyproject.toml" diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/__init__.py b/python/packages/autogen-ext/src/autogen_ext/experimental/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/README.md b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/README.md new file mode 100644 index 000000000000..d4830547e036 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/README.md @@ -0,0 +1,210 @@ +# Task-Centric Memory +_(EXPERIMENTAL, RESEARCH IN PROGRESS)_ + +**Task-Centric Memory** is an active research project aimed at giving AI agents the ability to: + +* Accomplish general tasks more effectively by learning quickly and continually beyond context-window limitations. +* Remember guidance, corrections, plans, and demonstrations provided by users. +* Learn through the agent's own experience and adapt quickly to changing circumstances. +* Avoid repeating mistakes on tasks that are similar to those previously encountered. 
+ +## Installation + +Install AutoGen and its extension package as follows: + +```bash +pip install -U "autogen-agentchat" "autogen-ext[openai]" "autogen-ext[task-centric-memory]" +``` + +## Quickstart + +

+ Description +

+ +This first code snippet runs a basic test to verify that the installation was successful, +as illustrated by the diagram to the right. + +```python +import asyncio +from autogen_ext.models.openai import OpenAIChatCompletionClient +from autogen_ext.experimental.task_centric_memory import MemoryController +from autogen_ext.experimental.task_centric_memory.utils import PageLogger + + +async def main() -> None: + client = OpenAIChatCompletionClient(model="gpt-4o") + logger = PageLogger(config={"level": "DEBUG", "path": "./pagelogs/quickstart"}) # Optional, but very useful. + memory_controller = MemoryController(reset=True, client=client, logger=logger) + + # Add a few task-insight pairs as memories, where an insight can be any string that may help solve the task. + await memory_controller.add_memo(task="What color do I like?", insight="Deep blue is my favorite color") + await memory_controller.add_memo(task="What's another color I like?", insight="I really like cyan") + await memory_controller.add_memo(task="What's my favorite food?", insight="Halibut is my favorite") + + # Retrieve memories for a new task that's related to only two of the stored memories. + memos = await memory_controller.retrieve_relevant_memos(task="What colors do I like most?") + print("{} memories retrieved".format(len(memos))) + for memo in memos: + print("- " + memo.insight) + + +asyncio.run(main()) +``` + +

+ Description +

+ +This second code example shows one way to incorporate task-centric memory directly into an AutoGen agent, +in this case a subclass of RoutedAgent. +To keep the code short, only the simplest form of memory retrieval is exercised by this agent. + +```python + +import asyncio +from dataclasses import dataclass +from typing import List + +from autogen_core import AgentId, MessageContext, RoutedAgent, SingleThreadedAgentRuntime, message_handler +from autogen_core.models import ChatCompletionClient, LLMMessage, SystemMessage, UserMessage +from autogen_ext.models.openai import OpenAIChatCompletionClient +from autogen_ext.experimental.task_centric_memory import MemoryController +from autogen_ext.experimental.task_centric_memory.utils import PageLogger + + +@dataclass +class Message: + content: str + + +class MemoryEnabledAgent(RoutedAgent): + def __init__( + self, description: str, model_client: ChatCompletionClient, memory_controller: MemoryController + ) -> None: + super().__init__(description) + self._model_client = model_client + self._memory_controller = memory_controller + + @message_handler + async def handle_message(self, message: Message, context: MessageContext) -> Message: + # Retrieve relevant memories for the task. + memos = await self._memory_controller.retrieve_relevant_memos(task=message.content) + + # Format the memories for the model. + formatted_memos = "Info that may be useful:\n" + "\n".join(["- " + memo.insight for memo in memos]) + print(f"{'-' * 23}Text appended to the user message{'-' * 24}\n{formatted_memos}\n{'-' * 80}") + + # Create the messages for the model with the retrieved memories. + messages: List[LLMMessage] = [ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content=message.content, source="user"), + UserMessage(content=formatted_memos, source="user"), + ] + + # Call the model with the messages. 
+ model_result = await self._model_client.create(messages=messages) + assert isinstance(model_result.content, str) + + # Send the model's response to the user. + return Message(content=model_result.content) + + +async def main() -> None: + client = OpenAIChatCompletionClient(model="gpt-4o") + logger = PageLogger(config={"level": "DEBUG", "path": "./pagelogs/quickstart2"}) # Optional, but very useful. + memory_controller = MemoryController(reset=True, client=client, logger=logger) + + # Prepopulate memory to mimic learning from a prior session. + await memory_controller.add_memo(task="What color do I like?", insight="Deep blue is my favorite color") + await memory_controller.add_memo(task="What's another color I like?", insight="I really like cyan") + await memory_controller.add_memo(task="What's my favorite food?", insight="Halibut is my favorite") + + # Create and start an agent runtime. + runtime = SingleThreadedAgentRuntime() + runtime.start() + + # Register the agent type. + await MemoryEnabledAgent.register( + runtime, + "memory_enabled_agent", + lambda: MemoryEnabledAgent( + "A agent with memory", model_client=client, memory_controller=memory_controller + ), + ) + + # Send a direct message to the agent. + request = "What colors do I like most?" + print("User request: " + request) + response = await runtime.send_message( + Message(content=request), AgentId("memory_enabled_agent", "default") + ) + print("Agent response: " + response.content) + + # Stop the agent runtime. + await runtime.stop() + + +asyncio.run(main()) +``` + +## Sample Code + +The example above modifies the agent's code. +But it's also possible to add task-centric memory to an agent or multi-agent team _without_ modifying any agent code. +See the [sample code](../../../../../../samples/task_centric_memory) for that and other forms of fast, memory-based learning. + + +## Architecture + +

+ Description +

+ +The block diagram to the right outlines the key components of the architecture in the most general form. +The memory components are shown in blue, and the green blocks represent external components. + +The **Memory Controller** implements the fast-learning methods described below, +and manages communication with a **Memory Bank** containing a vector DB and associated structures. + +The **Agent or Team** is the AI agent or team of agents to which memory is being added. +The sample code shows how to add task-centric memory to a simple AssistantAgent or a MagenticOneGroupChat team. + +The **Apprentice, app, or service** represents the code that instantiates the agent and memory controller, +and routes information between them, effectively wrapping agent and memory into a combined component. +The term _Apprentice_ connotes that this combination uses memory to learn quickly on the job. +The Apprentice class is a minimal reference implementation provided as utility code for illustration and testing, +but most applications will use their own code instead of the Apprentice. + +## Memory Creation and Storage + +Each stored memory (called a _memo_) contains a text insight and (optionally) a task description. +The insight is intended to help the agent accomplish future tasks that are similar to a prior task. +The memory controller provides methods for different types of learning. +If the user provides advice for solving a given task, the advice is extracted by the model client and stored as an insight. +If the user demonstrates how to perform a task, +the task and demonstration are stored together as an insight used to solve similar but different tasks. +If the agent is given a task (free of side-effects) and some means of determining success or failure, +the memory controller repeats the following learning loop in the background some number of times: + +1. Test the agent on the task a few times to check for a failure. +2. 
If a failure is found, analyze the agent's response in order to: +    1. Diagnose the failure of reasoning or missing information, +    2. Phrase a general piece of advice, such as what a teacher might give to a student, +    3. Temporarily append this advice to the task description, +    4. Return to step 1. +    5. If some piece of advice succeeds in helping the agent solve the task a number of times, add the advice as an insight to memory. +3. For each insight to be stored in memory, an LLM is prompted to generate a set of free-form, multi-word topics related to the insight. Each topic is embedded into a fixed-length vector and stored in a vector DB mapping it to the topic’s related insight. + +## Memory Retrieval and Usage + +The memory controller provides methods for different types of memory retrieval. +When the agent is given a task, the following steps are performed by the controller: +1. The task is rephrased into a generalized form. +2. A set of free-form, multi-word query topics are generated from the generalized task. +3. A potentially large number of previously stored topics, those most similar to each query topic, are retrieved from the vector DB along with the insights they map to. +4. These candidate memos are filtered by the aggregate similarity of their stored topics to the query topics. +5. In the final filtering stage, an LLM is prompted to validate only those insights that seem potentially useful in solving the task at hand. + +Retrieved insights that pass the filtering steps are listed under a heading like +"Important insights that may help solve tasks like this", then appended to the task description before it is passed to the agent as usual. 
diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/__init__.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/__init__.py new file mode 100644 index 000000000000..f68a3197d9fc --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/__init__.py @@ -0,0 +1,3 @@ +from .memory_controller import MemoryController + +__all__ = ["MemoryController"] diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_memory_bank.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_memory_bank.py new file mode 100644 index 000000000000..62ba71b55cbb --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_memory_bank.py @@ -0,0 +1,201 @@ +import os +import pickle +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, TypedDict + +from ._string_similarity_map import StringSimilarityMap +from .utils.page_logger import PageLogger + + +@dataclass +class Memo: + """ + Represents an atomic unit of memory that can be stored in a memory bank and later retrieved. + """ + + task: str | None # The task description, if any. + insight: str # A hint, solution, plan, or any other text that may help solve a similar task. + + +# Following the nested-config pattern, this TypedDict minimizes code changes by encapsulating +# the settings that change frequently, as when loading many settings from a single YAML file. +class MemoryBankConfig(TypedDict, total=False): + path: str + relevance_conversion_threshold: float + n_results: int + distance_threshold: int + + +class MemoryBank: + """ + Stores task-completion insights as memories in a vector DB for later retrieval. + + Args: + reset: True to clear the DB before starting. 
+ config: An optional dict that can be used to override the following values: + + - path: The path to the directory where the memory bank files are stored. + - relevance_conversion_threshold: The threshold used to normalize relevance. + - n_results: The maximum number of most relevant results to return for any given topic. + - distance_threshold: The maximum string-pair distance for a memo to be retrieved. + + logger: An optional logger. If None, no logging will be performed. + """ + + def __init__( + self, + reset: bool, + config: MemoryBankConfig | None = None, + logger: PageLogger | None = None, + ) -> None: + if logger is None: + logger = PageLogger() # Nothing will be logged by this object. + self.logger = logger + self.logger.enter_function() + + # Apply default settings and any config overrides. + memory_dir_path = "./memory_bank/default" + self.relevance_conversion_threshold = 1.7 + self.n_results = 25 + self.distance_threshold = 100 + if config is not None: + memory_dir_path = config.get("path", memory_dir_path) + self.relevance_conversion_threshold = config.get( + "relevance_conversion_threshold", self.relevance_conversion_threshold + ) + self.n_results = config.get("n_results", self.n_results) + self.distance_threshold = config.get("distance_threshold", self.distance_threshold) + + memory_dir_path = os.path.expanduser(memory_dir_path) + self.logger.info("\nMEMORY BANK DIRECTORY {}".format(memory_dir_path)) + path_to_db_dir = os.path.join(memory_dir_path, "string_map") + self.path_to_dict = os.path.join(memory_dir_path, "uid_memo_dict.pkl") + + self.string_map = StringSimilarityMap(reset=reset, path_to_db_dir=path_to_db_dir, logger=self.logger) + + # Load or create the associated memo dict on disk. 
+ self.uid_memo_dict: Dict[str, Memo] = {} + self.last_memo_id = 0 + if (not reset) and os.path.exists(self.path_to_dict): + self.logger.info("\nLOADING MEMOS FROM DISK at {}".format(self.path_to_dict)) + with open(self.path_to_dict, "rb") as f: + self.uid_memo_dict = pickle.load(f) + self.last_memo_id = len(self.uid_memo_dict) + self.logger.info("\n{} MEMOS LOADED".format(len(self.uid_memo_dict))) + + # Clear the DB if requested. + if reset: + self._reset_memos() + + self.logger.leave_function() + + def reset(self) -> None: + """ + Forces immediate deletion of all contents, in memory and on disk. + """ + self.string_map.reset_db() + self._reset_memos() + + def _reset_memos(self) -> None: + """ + Forces immediate deletion of the memos, in memory and on disk. + """ + self.logger.info("\nCLEARING MEMOS") + self.uid_memo_dict = {} + self.save_memos() + + def save_memos(self) -> None: + """ + Saves the current memo structures (possibly empty) to disk. + """ + self.string_map.save_string_pairs() + with open(self.path_to_dict, "wb") as file: + self.logger.info("\nSAVING MEMOS TO DISK at {}".format(self.path_to_dict)) + pickle.dump(self.uid_memo_dict, file) + + def contains_memos(self) -> bool: + """ + Returns True if the memory bank contains any memo. + """ + return len(self.uid_memo_dict) > 0 + + def _map_topics_to_memo(self, topics: List[str], memo_id: str, memo: Memo) -> None: + """ + Adds a mapping in the vec DB from each topic to the memo. + """ + self.logger.enter_function() + self.logger.info("\nINSIGHT\n{}".format(memo.insight)) + for topic in topics: + self.logger.info("\n TOPIC = {}".format(topic)) + self.string_map.add_input_output_pair(topic, memo_id) + self.uid_memo_dict[memo_id] = memo + self.save_memos() + self.logger.leave_function() + + def add_memo(self, insight_str: str, topics: List[str], task_str: Optional[str] = None) -> None: + """ + Adds an insight to the memory bank, given topics related to the insight, and optionally the task. 
+ """ + self.logger.enter_function() + self.last_memo_id += 1 + id_str = str(self.last_memo_id) + insight = Memo(insight=insight_str, task=task_str) + self._map_topics_to_memo(topics, id_str, insight) + self.logger.leave_function() + + def add_task_with_solution(self, task: str, solution: str, topics: List[str]) -> None: + """ + Adds a task-solution pair to the memory bank, to be retrieved together later as a combined insight. + This is useful when the insight is a demonstration of how to solve a given type of task. + """ + self.logger.enter_function() + self.last_memo_id += 1 + id_str = str(self.last_memo_id) + # Prepend the insight to the task description for context. + insight_str = "Example task:\n\n{}\n\nExample solution:\n\n{}".format(task, solution) + memo = Memo(insight=insight_str, task=task) + self._map_topics_to_memo(topics, id_str, memo) + self.logger.leave_function() + + def get_relevant_memos(self, topics: List[str]) -> List[Memo]: + """ + Returns any memos from the memory bank that appear sufficiently relevant to the input topics. + """ + self.logger.enter_function() + + # Retrieve all topic matches, and gather them into a single list. + matches: List[Tuple[str, str, float]] = [] # Each match is a tuple: (topic, memo_id, distance) + for topic in topics: + matches.extend(self.string_map.get_related_string_pairs(topic, self.n_results, self.distance_threshold)) + + # Build a dict of memo-relevance pairs from the matches. + memo_relevance_dict: Dict[str, float] = {} + for match in matches: + relevance = self.relevance_conversion_threshold - match[2] + memo_id = match[1] + if memo_id in memo_relevance_dict: + memo_relevance_dict[memo_id] += relevance + else: + memo_relevance_dict[memo_id] = relevance + + # Log the details of all the retrieved memos. 
+ self.logger.info("\n{} POTENTIALLY RELEVANT MEMOS".format(len(memo_relevance_dict))) + for memo_id, relevance in memo_relevance_dict.items(): + memo = self.uid_memo_dict[memo_id] + details = "" + if memo.task is not None: + details += "\n TASK: {}\n".format(memo.task) + details += "\n INSIGHT: {}\n\n RELEVANCE: {:.3f}\n".format(memo.insight, relevance) + self.logger.info(details) + + # Sort the memo-relevance pairs by relevance, in descending order. + memo_relevance_dict = dict(sorted(memo_relevance_dict.items(), key=lambda item: item[1], reverse=True)) + + # Compose the list of sufficiently relevant memos to return. + memo_list: List[Memo] = [] + for memo_id in memo_relevance_dict: + if memo_relevance_dict[memo_id] >= 0: + memo_list.append(self.uid_memo_dict[memo_id]) + + self.logger.leave_function() + return memo_list diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_prompter.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_prompter.py new file mode 100644 index 000000000000..2602feeb2c61 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_prompter.py @@ -0,0 +1,287 @@ +import time +from typing import List, Union + +from autogen_core import Image +from autogen_core.models import ( + AssistantMessage, + ChatCompletionClient, + CreateResult, + LLMMessage, + SystemMessage, + UserMessage, +) + +from .utils._functions import UserContent +from .utils.page_logger import PageLogger + + +class Prompter: + """ + Centralizes most of the Apprentice prompts sent to the model client. + + Args: + client: The client to call the model. + logger: An optional logger. If None, no logging will be performed. + """ + + def __init__(self, client: ChatCompletionClient, logger: PageLogger | None = None) -> None: + if logger is None: + logger = PageLogger() # Nothing will be logged by this object. 
+ self.logger = logger + + self.client = client + self.default_system_message_content = "You are a helpful assistant." + self.time_spent_in_model_calls = 0.0 + self.num_model_calls = 0 + self.start_time = time.time() + + # Create the chat history + self._chat_history: List[LLMMessage] = [] + + async def call_model( + self, + summary: str, + user_content: UserContent, + system_message_content: str | None = None, + keep_these_messages: bool = True, + ) -> str: + """ + Calls the model client with the given input and returns the response. + """ + # Prepare the input message list + if system_message_content is None: + system_message_content = self.default_system_message_content + system_message: LLMMessage + if self.client.model_info["family"] == "o1": + # No system message allowed, so pass it as the first user message. + system_message = UserMessage(content=system_message_content, source="User") + else: + # System message allowed. + system_message = SystemMessage(content=system_message_content) + + user_message = UserMessage(content=user_content, source="User") + input_messages = [system_message] + self._chat_history + [user_message] + + # Double check the types of the input messages. 
+ for message in input_messages: + for part in message.content: + assert isinstance(part, str) or isinstance(part, Image), "Invalid message content type: {}".format( + type(part) + ) + + # Call the model + start_time = time.time() + response = await self.client.create(input_messages) + assert isinstance(response, CreateResult) + response_string = response.content + assert isinstance(response_string, str) + response_message = AssistantMessage(content=response_string, source="Assistant") + assert isinstance(response_message, AssistantMessage) + self.time_spent_in_model_calls += time.time() - start_time + self.num_model_calls += 1 + + # Log the model call + self.logger.log_model_call(summary=summary, input_messages=input_messages, response=response) + + # Manage the chat history + if keep_these_messages: + self._chat_history.append(user_message) + self._chat_history.append(response_message) + + # Return the response as a string for now + return response_string + + def _clear_history(self) -> None: + """ + Empties the message list containing the chat history. + """ + self._chat_history = [] + + async def learn_from_failure( + self, task_description: str, memory_section: str, final_response: str, expected_answer: str, work_history: str + ) -> str: + """ + Tries to create an insight to help avoid the given failure in the future. + """ + sys_message = """- You are a patient and thorough teacher. 
+- Your job is to review work done by students and help them learn how to do better.""" + + user_message: List[Union[str, Image]] = [] + user_message.append("# A team of students made a mistake on the following task:\n") + user_message.extend([task_description]) + + if len(memory_section) > 0: + user_message.append(memory_section) + + user_message.append("# Here's the expected answer, which would have been correct:\n") + user_message.append(expected_answer) + + user_message.append("# Here is the students' answer, which was INCORRECT:\n") + user_message.append(final_response) + + user_message.append("# Please review the students' work which follows:\n") + user_message.append("**----- START OF STUDENTS' WORK -----**\n\n") + user_message.append(work_history) + user_message.append("\n**----- END OF STUDENTS' WORK -----**\n\n") + + user_message.append( + "# Now carefully review the students' work above, explaining in detail what the students did right and what they did wrong.\n" + ) + + self._clear_history() + await self.call_model( + summary="Ask the model to learn from this failure", + system_message_content=sys_message, + user_content=user_message, + ) + user_message = [ + "Now put yourself in the mind of the students. What misconception led them to their incorrect answer?" + ] + await self.call_model( + summary="Ask the model to state the misconception", + system_message_content=sys_message, + user_content=user_message, + ) + + user_message = [ + "Please express your key insights in the form of short, general advice that will be given to the students. Just one or two sentences, or they won't bother to read it." + ] + insight = await self.call_model( + summary="Ask the model to formulate a concise insight", + system_message_content=sys_message, + user_content=user_message, + ) + return insight + + async def find_index_topics(self, input_string: str) -> List[str]: + """ + Returns a list of topics related to the given string. 
+ """ + sys_message = """You are an expert at semantic analysis.""" + + user_message: List[Union[str, Image]] = [] + user_message.append("""- My job is to create a thorough index for a book called Task Completion, and I need your help. +- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them. +- Your job is to read the text below and extract the task-completion topics that are covered. +- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more. +- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics. +- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks. +- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n""") + + user_message.append("# Text to be indexed\n") + user_message.append(input_string) + + self._clear_history() + topics = await self.call_model( + summary="Ask the model to extract topics", system_message_content=sys_message, user_content=user_message + ) + + # Parse the topics into a list. + topic_list: List[str] = [] + for line in topics.split("\n"): + if len(line) > 0: + topic_list.append(line) + + return topic_list + + async def generalize_task(self, task_description: str) -> str: + """ + Attempts to rewrite a task description in a more general form. + """ + + sys_message = """You are a helpful and thoughtful assistant.""" + + user_message: List[Union[str, Image]] = [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. 
Please read through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points." +        ] +        user_message.append("\n# Task description") +        user_message.append(task_description) + +        self._clear_history() +        await self.call_model( +            summary="Ask the model to rephrase the task in a list of important points", +            system_message_content=sys_message, +            user_content=user_message, +        ) + +        user_message = [ +            "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." +        ] +        await self.call_model( +            summary="Ask the model to identify irrelevant points", +            system_message_content=sys_message, +            user_content=user_message, +        ) + +        user_message = [ +            "Revise your original list to include only the most general terms, those that are critical to solving the task, removing any themes or descriptions that are not essential to the solution. Your final list may be shorter, but do not leave out any part of the task that is needed for solving the task. Do not add any additional commentary either before or after the list." +        ] +        generalized_task = await self.call_model( +            summary="Ask the model to make a final list of general terms", +            system_message_content=sys_message, +            user_content=user_message, +        ) +        return generalized_task + +    async def validate_insight(self, insight: str, task_description: str) -> bool: +        """ +        Judges whether the insight could help solve the task. +        """ + +        sys_message = """You are a helpful and thoughtful assistant.""" + +        user_message: List[Union[str, Image]] = [ +            """We have been given a potential insight that may or may not be useful for solving a given task. +- First review the following task. +- Then review the insight that follows, and consider whether it might help solve the given task. +- Do not attempt to actually solve the task. 
+- Reply with a single character, '1' if the insight may be useful, or '0' if it is not."""
+        ]
+        user_message.append("\n# Task description")
+        user_message.append(task_description)
+        user_message.append("\n# Possibly useful insight")
+        user_message.append(insight)
+        self._clear_history()
+        response = await self.call_model(
+            summary="Ask the model to validate the insight",
+            system_message_content=sys_message,
+            user_content=user_message,
+        )
+        return response == "1"
+
+    async def extract_task(self, text: str) -> str | None:
+        """
+        Returns a task found in the given text, or None if not found.
+        """
+        sys_message = """You are a helpful and thoughtful assistant."""
+        user_message: List[Union[str, Image]] = [
+            """Does the following text contain a question or some task we are being asked to perform?
+- If so, please reply with the full question or task description, along with any supporting information, but without adding extra commentary or formatting.
+- If the task is just to remember something, that doesn't count as a task, so don't include it.
+- If there is no question or task in the text, simply write "None" with no punctuation."""
+        ]
+        user_message.append("\n# Text to analyze")
+        user_message.append(text)
+        self._clear_history()
+        response = await self.call_model(
+            summary="Ask the model to extract a task", system_message_content=sys_message, user_content=user_message
+        )
+        return response if response != "None" else None
+
+    async def extract_advice(self, text: str) -> str | None:
+        """
+        Returns advice from the given text, or None if not found.
+        """
+        sys_message = """You are a helpful and thoughtful assistant."""
+        user_message: List[Union[str, Image]] = [
+            """Does the following text contain any information or advice that might be useful later?
+- If so, please copy the information or advice, adding no extra commentary or formatting. 
+- If there is no potentially useful information or advice at all, simply write "None" with no punctuation.""" + ] + user_message.append("\n# Text to analyze") + user_message.append(text) + self._clear_history() + response = await self.call_model( + summary="Ask the model to extract advice", system_message_content=sys_message, user_content=user_message + ) + return response if response != "None" else None diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_string_similarity_map.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_string_similarity_map.py new file mode 100644 index 000000000000..1510c41bc13b --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/_string_similarity_map.py @@ -0,0 +1,124 @@ +import os +import pickle +from typing import Dict, List, Tuple, Union + +import chromadb +from chromadb.api.types import ( + QueryResult, +) +from chromadb.config import Settings + +from .utils.page_logger import PageLogger + + +class StringSimilarityMap: + """ + Provides storage and similarity-based retrieval of string pairs using a vector database. + Each DB entry is a pair of strings: an input string and an output string. + The input string is embedded and used as the retrieval key. + The output string can be anything, but it's typically used as a dict key. + Vector embeddings are currently supplied by Chroma's default Sentence Transformers. + + Args: + - reset: True to clear the DB immediately after creation. + - path_to_db_dir: Path to the directory where the DB is stored. + - logger: An optional logger. If None, no logging will be performed. + """ + + def __init__(self, reset: bool, path_to_db_dir: str, logger: PageLogger | None = None) -> None: + if logger is None: + logger = PageLogger() # Nothing will be logged by this object. + self.logger = logger + self.path_to_db_dir = path_to_db_dir + + # Load or create the vector DB on disk. 
+ chromadb_settings = Settings( + anonymized_telemetry=False, allow_reset=True, is_persistent=True, persist_directory=path_to_db_dir + ) + self.db_client = chromadb.Client(chromadb_settings) + self.vec_db = self.db_client.create_collection("string-pairs", get_or_create=True) # The collection is the DB. + + # Load or create the associated string-pair dict on disk. + self.path_to_dict = os.path.join(path_to_db_dir, "uid_text_dict.pkl") + self.uid_text_dict: Dict[str, Tuple[str, str]] = {} + self.last_string_pair_id = 0 + if (not reset) and os.path.exists(self.path_to_dict): + self.logger.debug("\nLOADING STRING SIMILARITY MAP FROM DISK at {}".format(self.path_to_dict)) + with open(self.path_to_dict, "rb") as f: + self.uid_text_dict = pickle.load(f) + self.last_string_pair_id = len(self.uid_text_dict) + if len(self.uid_text_dict) > 0: + self.logger.debug("\n{} STRING PAIRS LOADED".format(len(self.uid_text_dict))) + self._log_string_pairs() + + # Clear the DB if requested. + if reset: + self.reset_db() + + def _log_string_pairs(self) -> None: + """ + Logs all string pairs currently in the map. + """ + self.logger.debug("LIST OF STRING PAIRS") + for uid, text in self.uid_text_dict.items(): + input_text, output_text = text + self.logger.debug(" ID: {}\n INPUT TEXT: {}\n OUTPUT TEXT: {}".format(uid, input_text, output_text)) + + def save_string_pairs(self) -> None: + """ + Saves the string-pair dict (self.uid_text_dict) to disk. + """ + self.logger.debug("\nSAVING STRING SIMILARITY MAP TO DISK at {}".format(self.path_to_dict)) + with open(self.path_to_dict, "wb") as file: + pickle.dump(self.uid_text_dict, file) + + def reset_db(self) -> None: + """ + Forces immediate deletion of the DB's contents, in memory and on disk. 
+ """ + self.logger.debug("\nCLEARING STRING-PAIR MAP") + self.db_client.delete_collection("string-pairs") + self.vec_db = self.db_client.create_collection("string-pairs") + self.uid_text_dict = {} + self.save_string_pairs() + + def add_input_output_pair(self, input_text: str, output_text: str) -> None: + """ + Adds one input-output string pair to the DB. + """ + self.last_string_pair_id += 1 + self.vec_db.add(documents=[input_text], ids=[str(self.last_string_pair_id)]) + self.uid_text_dict[str(self.last_string_pair_id)] = input_text, output_text + self.logger.debug( + "\nINPUT-OUTPUT PAIR ADDED TO VECTOR DATABASE:\n ID\n {}\n INPUT\n {}\n OUTPUT\n {}\n".format( + self.last_string_pair_id, input_text, output_text + ) + ) + # self._log_string_pairs() # For deeper debugging, uncomment to log all string pairs after each addition. + + def get_related_string_pairs( + self, query_text: str, n_results: int, threshold: Union[int, float] + ) -> List[Tuple[str, str, float]]: + """ + Retrieves up to n string pairs that are related to the given query text within the specified distance threshold. 
+ """ + string_pairs_with_distances: List[Tuple[str, str, float]] = [] + if n_results > len(self.uid_text_dict): + n_results = len(self.uid_text_dict) + if n_results > 0: + results: QueryResult = self.vec_db.query(query_texts=[query_text], n_results=n_results) + num_results = len(results["ids"][0]) + for i in range(num_results): + uid = results["ids"][0][i] + input_text = results["documents"][0][i] if results["documents"] else "" + distance = results["distances"][0][i] if results["distances"] else 0.0 + if distance < threshold: + input_text_2, output_text = self.uid_text_dict[uid] + assert input_text == input_text_2 + self.logger.debug( + "\nINPUT-OUTPUT PAIR RETRIEVED FROM VECTOR DATABASE:\n INPUT1\n {}\n OUTPUT\n {}\n DISTANCE\n {}".format( + input_text, output_text, distance + ) + ) + string_pairs_with_distances.append((input_text, output_text, distance)) + return string_pairs_with_distances diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/memory_controller.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/memory_controller.py new file mode 100644 index 000000000000..3a25f6ea18f9 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/memory_controller.py @@ -0,0 +1,442 @@ +from typing import TYPE_CHECKING, Awaitable, Callable, List, Tuple, TypedDict + +from autogen_core.models import ( + ChatCompletionClient, +) + +from ._memory_bank import Memo, MemoryBank +from ._prompter import Prompter + +if TYPE_CHECKING: + from ._memory_bank import MemoryBankConfig +from .utils.grader import Grader +from .utils.page_logger import PageLogger + + +# Following the nested-config pattern, this TypedDict minimizes code changes by encapsulating +# the settings that change frequently, as when loading many settings from a single YAML file. 
+class MemoryControllerConfig(TypedDict, total=False): + max_train_trials: int + max_test_trials: int + MemoryBank: "MemoryBankConfig" + + +class MemoryController: + """ + (EXPERIMENTAL, RESEARCH IN PROGRESS) + + Implements fast, memory-based learning, and manages the flow of information to and from a memory bank. + + Args: + reset: True to empty the memory bank before starting. + client: The model client to use internally. + task_assignment_callback: An optional callback used to assign a task to any agent managed by the caller. + config: An optional dict that can be used to override the following values: + + - max_train_trials: The maximum number of learning iterations to attempt when training on a task. + - max_test_trials: The total number of attempts made when testing for failure on a task. + - MemoryBank: A config dict passed to MemoryBank. + + logger: An optional logger. If None, a default logger will be created. + + Example: + + The `task-centric-memory` extra first needs to be installed: + + .. code-block:: bash + + pip install "autogen-ext[task-centric-memory]" + + The following code snippet shows how to use this class for the most basic storage and retrieval of memories.: + + .. code-block:: python + + import asyncio + from autogen_ext.models.openai import OpenAIChatCompletionClient + from autogen_ext.experimental.task_centric_memory import MemoryController + from autogen_ext.experimental.task_centric_memory.utils import PageLogger + + + async def main() -> None: + client = OpenAIChatCompletionClient(model="gpt-4o") + logger = PageLogger(config={"level": "DEBUG", "path": "./pagelogs/quickstart"}) # Optional, but very useful. + memory_controller = MemoryController(reset=True, client=client, logger=logger) + + # Add a few task-insight pairs as memories, where an insight can be any string that may help solve the task. 
+ await memory_controller.add_memo(task="What color do I like?", insight="Deep blue is my favorite color") + await memory_controller.add_memo(task="What's another color I like?", insight="I really like cyan") + await memory_controller.add_memo(task="What's my favorite food?", insight="Halibut is my favorite") + + # Retrieve memories for a new task that's related to only two of the stored memories. + memos = await memory_controller.retrieve_relevant_memos(task="What colors do I like most?") + print("{} memories retrieved".format(len(memos))) + for memo in memos: + print("- " + memo.insight) + + + asyncio.run(main()) + """ + + def __init__( + self, + reset: bool, + client: ChatCompletionClient, + task_assignment_callback: Callable[[str], Awaitable[Tuple[str, str]]] | None = None, + config: MemoryControllerConfig | None = None, + logger: PageLogger | None = None, + ) -> None: + if logger is None: + logger = PageLogger({"level": "DEBUG"}) + self.logger = logger + self.logger.enter_function() + + # Apply default settings and any config overrides. + self.max_train_trials = 10 + self.max_test_trials = 3 + memory_bank_config = None + if config is not None: + self.max_train_trials = config.get("max_train_trials", self.max_train_trials) + self.max_test_trials = config.get("max_test_trials", self.max_test_trials) + memory_bank_config = config.get("MemoryBank", memory_bank_config) + + self.client = client + self.task_assignment_callback = task_assignment_callback + self.prompter = Prompter(client, logger) + self.memory_bank = MemoryBank(reset=reset, config=memory_bank_config, logger=logger) + self.grader = Grader(client, logger) + self.logger.leave_function() + + def reset_memory(self) -> None: + """ + Empties the memory bank in RAM and on disk. + """ + self.memory_bank.reset() + + async def train_on_task(self, task: str, expected_answer: str) -> None: + """ + Repeatedly assigns a task to the agent, and tries to learn from failures by creating useful insights as memories. 
+ """ + self.logger.enter_function() + self.logger.info("Iterate on the task, possibly discovering a useful new insight.\n") + _, insight = await self._iterate_on_task(task, expected_answer) + if insight is None: + self.logger.info("No useful insight was discovered.\n") + else: + self.logger.info("A new insight was created:\n{}".format(insight)) + await self.add_memo(insight, task) + self.logger.leave_function() + + async def test_on_task(self, task: str, expected_answer: str, num_trials: int = 1) -> Tuple[str, int, int]: + """ + Assigns a task to the agent, along with any relevant memos retrieved from memory. + """ + self.logger.enter_function() + assert self.task_assignment_callback is not None + response = "" + num_successes = 0 + + for trial in range(num_trials): + self.logger.info("\n----- TRIAL {} -----\n".format(trial + 1)) + task_plus_insights = task + + # Try to retrieve any relevant memories from the DB. + filtered_memos = await self.retrieve_relevant_memos(task) + filtered_insights = [memo.insight for memo in filtered_memos] + if len(filtered_insights) > 0: + self.logger.info("Relevant insights were retrieved from memory.\n") + memory_section = self._format_memory_section(filtered_insights) + if len(memory_section) > 0: + task_plus_insights = task + "\n\n" + memory_section + + # Attempt to solve the task. + self.logger.info("Try to solve the task.\n") + response, _ = await self.task_assignment_callback(task_plus_insights) + + # Check if the response is correct. + response_is_correct, extracted_answer = await self.grader.is_response_correct( + task, response, expected_answer + ) + self.logger.info("Extracted answer: {}".format(extracted_answer)) + if response_is_correct: + self.logger.info("Answer is CORRECT.\n") + num_successes += 1 + else: + self.logger.info("Answer is INCORRECT.\n") + + # Calculate the success rate as a percentage, rounded to the nearest whole number. 
+ self.logger.info("\nSuccess rate: {}%\n".format(round((num_successes / num_trials) * 100))) + self.logger.leave_function() + return response, num_successes, num_trials + + async def add_memo(self, insight: str, task: None | str = None, index_on_both: bool = True) -> None: + """ + Adds one insight to the memory bank, using the task (if provided) as context. + """ + self.logger.enter_function() + + generalized_task = "" + if task is not None: + self.logger.info("\nGIVEN TASK:") + self.logger.info(task) + # Generalize the task. + generalized_task = await self.prompter.generalize_task(task) + + self.logger.info("\nGIVEN INSIGHT:") + self.logger.info(insight) + + # Get a list of topics from the insight and the task (if provided). + if task is None: + text_to_index = insight + self.logger.info("\nTOPICS EXTRACTED FROM INSIGHT:") + else: + if index_on_both: + text_to_index = generalized_task.strip() + "\n(Hint: " + insight + ")" + self.logger.info("\nTOPICS EXTRACTED FROM TASK AND INSIGHT COMBINED:") + else: + text_to_index = task + self.logger.info("\nTOPICS EXTRACTED FROM TASK:") + + topics = await self.prompter.find_index_topics(text_to_index) + self.logger.info("\n".join(topics)) + self.logger.info("") + + # Add the insight to the memory bank. + self.memory_bank.add_memo(insight, topics, task) + self.logger.leave_function() + + async def add_task_solution_pair_to_memory(self, task: str, solution: str) -> None: + """ + Adds a task-solution pair to the memory bank, to be retrieved together later as a combined insight. + This is useful when the task-solution pair is an exemplar of solving a task related to some other task. + """ + self.logger.enter_function() + + self.logger.info("\nEXAMPLE TASK:") + self.logger.info(task) + + self.logger.info("\nEXAMPLE SOLUTION:") + self.logger.info(solution) + + # Get a list of topics from the task. 
+ topics = await self.prompter.find_index_topics(task.strip()) + self.logger.info("\nTOPICS EXTRACTED FROM TASK:") + self.logger.info("\n".join(topics)) + self.logger.info("") + + # Add the task and solution (as a combined insight) to the memory bank. + self.memory_bank.add_task_with_solution(task=task, solution=solution, topics=topics) + self.logger.leave_function() + + async def retrieve_relevant_memos(self, task: str) -> List[Memo]: + """ + Retrieves any memos from memory that seem relevant to the task. + """ + self.logger.enter_function() + + if self.memory_bank.contains_memos(): + self.logger.info("\nCURRENT TASK:") + self.logger.info(task) + + # Get a list of topics from the generalized task. + generalized_task = await self.prompter.generalize_task(task) + task_topics = await self.prompter.find_index_topics(generalized_task) + self.logger.info("\nTOPICS EXTRACTED FROM TASK:") + self.logger.info("\n".join(task_topics)) + self.logger.info("") + + # Retrieve relevant memos from the memory bank. + memo_list = self.memory_bank.get_relevant_memos(topics=task_topics) + + # Apply a final validation stage to keep only the memos that the LLM concludes are sufficiently relevant. + validated_memos: List[Memo] = [] + for memo in memo_list: + if await self.prompter.validate_insight(memo.insight, task): + validated_memos.append(memo) + + self.logger.info("\n{} VALIDATED MEMOS".format(len(validated_memos))) + for memo in validated_memos: + if memo.task is not None: + self.logger.info("\n TASK: {}".format(memo.task)) + self.logger.info("\n INSIGHT: {}".format(memo.insight)) + else: + self.logger.info("\nNO SUFFICIENTLY RELEVANT MEMOS WERE FOUND IN MEMORY") + validated_memos = [] + + self.logger.leave_function() + return validated_memos + + def _format_memory_section(self, memories: List[str]) -> str: + """ + Formats a list of memories as a section for appending to a task description. 
+ """ + memory_section = "" + if len(memories) > 0: + memory_section = "## Important insights that may help solve tasks like this\n" + for mem in memories: + memory_section += "- " + mem + "\n" + return memory_section + + async def _test_for_failure( + self, task: str, task_plus_insights: str, expected_answer: str + ) -> Tuple[bool, str, str]: + """ + Attempts to solve the given task multiple times to find a failure case to learn from. + """ + self.logger.enter_function() + self.logger.info("\nTask description, including any insights: {}".format(task_plus_insights)) + self.logger.info("\nExpected answer: {}\n".format(expected_answer)) + + assert self.task_assignment_callback is not None + failure_found = False + response, work_history = "", "" + + for trial in range(self.max_test_trials): + self.logger.info("\n----- TRIAL {} -----\n".format(trial + 1)) + + # Attempt to solve the task. + self.logger.info("Try to solve the task.") + response, work_history = await self.task_assignment_callback(task_plus_insights) + + response_is_correct, extracted_answer = await self.grader.is_response_correct( + task, response, expected_answer + ) + self.logger.info("Extracted answer: {}".format(extracted_answer)) + if response_is_correct: + self.logger.info("Answer is CORRECT.\n") + else: + self.logger.info("Answer is INCORRECT.\n Stop testing, and return the details of the failure.\n") + failure_found = True + break + + self.logger.leave_function() + return failure_found, response, work_history + + async def _iterate_on_task(self, task: str, expected_answer: str) -> Tuple[str, None | str]: + """ + Repeatedly assigns a task to the agent, and tries to learn from failures by creating useful insights as memories. 
+ """ + self.logger.enter_function() + self.logger.info("\nTask description: {}".format(task)) + self.logger.info("\nExpected answer: {}\n".format(expected_answer)) + + final_response = "" + old_memos = await self.retrieve_relevant_memos(task) + old_insights = [memo.insight for memo in old_memos] + new_insights: List[str] = [] + last_insight = None + insight = None + successful_insight = None + + # Loop until success (or timeout) while learning from failures. + for trial in range(1, self.max_train_trials + 1): + self.logger.info("\n----- TRAIN TRIAL {} -----\n".format(trial)) + task_plus_insights = task + + # Add any new insights we've accumulated so far. + if last_insight is not None: + memory_section = self._format_memory_section(old_insights + [last_insight]) + else: + memory_section = self._format_memory_section(old_insights) + if len(memory_section) > 0: + task_plus_insights += "\n\n" + memory_section + + # Can we find a failure case to learn from? + failure_found, response, work_history = await self._test_for_failure( + task, task_plus_insights, expected_answer + ) + if not failure_found: + # No. Time to exit the loop. + self.logger.info("\nResponse is CORRECT.\n Stop looking for insights.\n") + # Was this the first trial? + if trial == 1: + # Yes. We should return the successful response, and no insight. + final_response = response + else: + # No. We learned a successful insight, which should be returned. + successful_insight = insight + break + + # Will we try again? + if trial == self.max_train_trials: + # No. We're out of training trials. + self.logger.info("\nNo more trials will be attempted.\n") + break + + # Try to learn from this failure. + self.logger.info("\nResponse is INCORRECT. 
Try to learn from this failure.\n") + insight = await self.prompter.learn_from_failure( + task, memory_section, response, expected_answer, work_history + ) + self.logger.info("\nInsight: {}\n".format(insight)) + new_insights.append(insight) + last_insight = insight + + # Return the answer from the last loop. + self.logger.info("\n{}\n".format(final_response)) + self.logger.leave_function() + return final_response, successful_insight + + async def _append_any_relevant_memories(self, task: str) -> str: + """ + Appends any relevant memories to the task description. + """ + self.logger.enter_function() + + filtered_memos = await self.retrieve_relevant_memos(task) + filtered_insights = [memo.insight for memo in filtered_memos] + if len(filtered_insights) > 0: + self.logger.info("Relevant insights were retrieved from memory.\n") + memory_section = self._format_memory_section(filtered_insights) + if len(memory_section) > 0: + task = task + "\n\n" + memory_section + + self.logger.leave_function() + return task + + async def assign_task(self, task: str, use_memory: bool = True, should_await: bool = True) -> str: + """ + Assigns a task to some agent through the task_assignment_callback, along with any relevant memories. + """ + self.logger.enter_function() + + assert self.task_assignment_callback is not None + + if use_memory: + task = await self._append_any_relevant_memories(task) + + # Attempt to solve the task. + self.logger.info("Try to solve the task.\n") + assert should_await + response, _ = await self.task_assignment_callback(task) + + self.logger.leave_function() + return response + + async def consider_memo_storage(self, text: str) -> str | None: + """ + Tries to extract any advice from the given text and add it to memory. 
+ """ + self.logger.enter_function() + + advice = await self.prompter.extract_advice(text) + self.logger.info("Advice: {}".format(advice)) + if advice is not None: + await self.add_memo(insight=advice) + + self.logger.leave_function() + return advice + + async def handle_user_message(self, text: str, should_await: bool = True) -> str: + """ + Handles a user message by extracting any advice as an insight to be stored in memory, and then calling assign_task(). + """ + self.logger.enter_function() + + # Check for advice. + advice = await self.consider_memo_storage(text) + + # Assign the task through the task_assignment_callback, using memory only if no advice was just provided. + response = await self.assign_task(text, use_memory=(advice is None), should_await=should_await) + + self.logger.leave_function() + return response diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/__init__.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/__init__.py new file mode 100644 index 000000000000..992e0e8ef1c0 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/__init__.py @@ -0,0 +1,7 @@ +from .apprentice import Apprentice +from .chat_completion_client_recorder import ChatCompletionClientRecorder +from .grader import Grader +from .page_logger import PageLogger +from .teachability import Teachability + +__all__ = ["Apprentice", "ChatCompletionClientRecorder", "Grader", "PageLogger", "Teachability"] diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/_functions.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/_functions.py new file mode 100644 index 000000000000..f1c9aed6f41c --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/_functions.py @@ -0,0 +1,96 @@ +import hashlib +import os +from typing import List, Tuple, 
Union + +from autogen_core import FunctionCall, Image +from autogen_core.models import FunctionExecutionResult + +# Convenience types +UserContent = Union[str, List[Union[str, Image]]] +AssistantContent = Union[str, List[FunctionCall]] +FunctionExecutionContent = List[FunctionExecutionResult] +SystemContent = str +MessageContent = UserContent | AssistantContent | SystemContent | FunctionExecutionContent + + +def message_content_to_str(message_content: MessageContent | None) -> str: + """ + Converts the message content to a string. + """ + if message_content is None: + return "" + elif isinstance(message_content, str): + return message_content + elif isinstance(message_content, List): + converted: List[str] = list() + for item in message_content: + if isinstance(item, str): + converted.append(item) + elif isinstance(item, Image): + converted.append("") + else: + converted.append(str(item).rstrip()) + return "\n".join(converted) + else: + raise AssertionError("Unexpected response type.") + + +def text_from_user_content(user_content: UserContent) -> str: + """ + Extracts just the text from the user content. + """ + if isinstance(user_content, str): + return user_content + elif isinstance(user_content, List): + text_list: List[str] = list() + for item in user_content: + if isinstance(item, str): + text_list.append(item.rstrip()) + return "\n\n".join(text_list) + else: + raise AssertionError("Unexpected response type.") + + +def single_image_from_user_content(user_content: UserContent) -> Union[Image, None]: + """ + Extracts a single image from the user content. + """ + image_to_return = None + if isinstance(user_content, str): + return None + elif isinstance(user_content, List): + for item in user_content: + if isinstance(item, Image): + assert image_to_return is None, "Only one image is currently allowed in the user content." 
+ image_to_return = item + else: + raise AssertionError("Unexpected response type.") + return image_to_return + + +def hash_directory(directory: str, hash_algo: str = "sha256") -> Tuple[str, int, int]: + """Computes a hash representing the state of a directory, including its structure and file contents.""" + hash_func = hashlib.new(hash_algo) + + # Also count the number of files and sub-directories + num_files = 0 + num_subdirs = 0 + + for root, dirs, files in sorted(os.walk(directory)): # Ensure order for consistent hashing + num_files += len(files) + num_subdirs += len(dirs) + for dir_name in sorted(dirs): + hash_func.update(dir_name.encode()) # Hash directory names + + for file_name in sorted(files): + file_path = os.path.join(root, file_name) + hash_func.update(file_name.encode()) # Hash file names + + try: + with open(file_path, "rb") as f: + while chunk := f.read(4096): # Read in chunks + hash_func.update(chunk) + except Exception: + pass + + return hash_func.hexdigest(), num_files, num_subdirs diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/apprentice.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/apprentice.py new file mode 100644 index 000000000000..a8104c0ebc44 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/apprentice.py @@ -0,0 +1,262 @@ +import random +import time +from typing import TYPE_CHECKING, Any, List, Sequence, Tuple, TypedDict + +from autogen_agentchat.agents import AssistantAgent +from autogen_agentchat.base import TaskResult +from autogen_agentchat.messages import AgentEvent, ChatMessage, TextMessage +from autogen_core.models import ( + ChatCompletionClient, + LLMMessage, + SystemMessage, + UserMessage, +) + +from .page_logger import PageLogger + +if TYPE_CHECKING: + from ..memory_controller import MemoryControllerConfig + + +# Following the nested-config pattern, this TypedDict minimizes code 
changes by encapsulating +# the settings that change frequently, as when loading many settings from a single YAML file. +class ApprenticeConfig(TypedDict, total=False): + name_of_agent_or_team: str + disable_prefix_caching: bool + MemoryController: "MemoryControllerConfig" + + +class Apprentice: + """ + A minimal wrapper combining task-centric memory with an agent or team. + Applications may use the Apprentice class, or they may directly instantiate + and call the Memory Controller using this class as an example. + + Args: + client: The client to call the model. + config: An optional dict that can be used to override the following values: + + - name_of_agent_or_team: The name of the target agent or team for assigning tasks to. + - disable_prefix_caching: True to disable prefix caching by prepending random ints to the first message. + - MemoryController: A config dict passed to MemoryController. + + logger: An optional logger. If None, a default logger will be created. + """ + + def __init__( + self, + client: ChatCompletionClient, + config: ApprenticeConfig | None = None, + logger: PageLogger | None = None, + ) -> None: + if logger is None: + logger = PageLogger({"level": "DEBUG"}) + self.logger = logger + + # Apply default settings and any config overrides. + self.name_of_agent_or_team = "AssistantAgent" + self.disable_prefix_caching = False + memory_controller_config = None + if config is not None: + self.name_of_agent_or_team = config.get("name_of_agent_or_team", self.name_of_agent_or_team) + self.disable_prefix_caching = config.get("disable_prefix_caching", self.disable_prefix_caching) + memory_controller_config = config.get("MemoryController", memory_controller_config) + + self.client = client + if self.disable_prefix_caching: + self.rand = random.Random() + self.rand.seed(int(time.time() * 1000)) + + # Create the MemoryController, which creates the MemoryBank. 
+ from ..memory_controller import MemoryController + + self.memory_controller = MemoryController( + reset=True, + client=self.client, + task_assignment_callback=self.assign_task_to_agent_or_team, + config=memory_controller_config, + logger=self.logger, + ) + + def reset_memory(self) -> None: + """ + Resets the memory bank. + """ + self.memory_controller.reset_memory() + + async def handle_user_message(self, text: str, should_await: bool = True) -> str: + """ + Handles a user message, extracting any advice and assigning a task to the agent. + """ + self.logger.enter_function() + + # Pass the user message through to the memory controller. + response = await self.memory_controller.handle_user_message(text, should_await) + + self.logger.leave_function() + return response + + async def add_task_solution_pair_to_memory(self, task: str, solution: str) -> None: + """ + Adds a task-solution pair to the memory bank, to be retrieved together later as a combined insight. + This is useful when the insight is a demonstration of how to solve a given type of task. + """ + self.logger.enter_function() + + # Pass the task and solution through to the memory controller. + await self.memory_controller.add_task_solution_pair_to_memory(task, solution) + + self.logger.leave_function() + + async def assign_task(self, task: str, use_memory: bool = True, should_await: bool = True) -> str: + """ + Assigns a task to the agent, along with any relevant insights/memories. + """ + self.logger.enter_function() + + # Pass the task through to the memory controller. + response = await self.memory_controller.assign_task(task, use_memory, should_await) + + self.logger.leave_function() + return response + + async def train_on_task(self, task: str, expected_answer: str) -> None: + """ + Repeatedly assigns a task to the completion agent, and tries to learn from failures by creating useful insights as memories. + """ + self.logger.enter_function() + + # Pass the task through to the memory controller. 
+ await self.memory_controller.train_on_task(task, expected_answer) + + self.logger.leave_function() + + async def assign_task_to_agent_or_team(self, task: str) -> Tuple[str, str]: + """ + Passes the given task to the target agent or team. + """ + self.logger.enter_function() + + # Pass the task through. + if self.name_of_agent_or_team == "MagenticOneGroupChat": + response, work_history = await self._assign_task_to_magentic_one(task) + elif self.name_of_agent_or_team == "AssistantAgent": + response, work_history = await self._assign_task_to_assistant_agent(task) + else: + raise AssertionError("Invalid base agent") + + self.logger.leave_function() + return response, work_history + + async def _assign_task_to_assistant_agent(self, task: str) -> Tuple[Any, Any]: + """ + Passes the given task to a newly created AssistantAgent with a generic 6-step system prompt. + """ + self.logger.enter_function() + self.logger.info(task) + + system_message_content = """You are a helpful and thoughtful assistant. +In responding to every user message, you follow the same multi-step process given here: +1. Explain your understanding of the user message in detail, covering all the important points. +2. List as many possible responses as you can think of. +3. Carefully list and weigh the pros and cons (if any) of each possible response. +4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist. +5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory. +6. Finish by providing your final response in the particular format requested by the user.""" + + if self.disable_prefix_caching: + # Prepend a random int to disable prefix caching. 
+ random_str = "({})\n\n".format(self.rand.randint(0, 1000000)) + system_message_content = random_str + system_message_content + + system_message: LLMMessage + if self.client.model_info["family"] == "o1": + # No system message allowed, so pass it as the first user message. + system_message = UserMessage(content=system_message_content, source="User") + else: + # System message allowed. + system_message = SystemMessage(content=system_message_content) + + user_message: LLMMessage = UserMessage(content=task, source="User") + system_message_list: List[LLMMessage] = [system_message] + user_message_list: List[LLMMessage] = [user_message] + input_messages: List[LLMMessage] = system_message_list + user_message_list + + assistant_agent = AssistantAgent( + "assistant_agent", + self.client, + system_message=system_message_content, + ) + + # Get the agent's response to the task. + task_result: TaskResult = await assistant_agent.run(task=TextMessage(content=task, source="User")) + messages: Sequence[AgentEvent | ChatMessage] = task_result.messages + message: AgentEvent | ChatMessage = messages[-1] + response_str = message.content + + # Log the model call + self.logger.log_model_task( + summary="Ask the model to complete the task", input_messages=input_messages, task_result=task_result + ) + self.logger.info("\n----- RESPONSE -----\n\n{}\n".format(response_str)) + + # Use the response as the work history as well. + work_history = response_str + + self.logger.leave_function() + return response_str, work_history + + async def _assign_task_to_magentic_one(self, task: str) -> Tuple[str, str]: + """ + Instantiates a MagenticOneGroupChat team, and passes the given task to it. 
+ """ + self.logger.enter_function() + self.logger.info(task) + + general_agent = AssistantAgent( + "general_agent", + self.client, + description="A general GPT-4o AI assistant capable of performing a variety of tasks.", + ) + + from autogen_ext.agents.web_surfer import MultimodalWebSurfer + + web_surfer = MultimodalWebSurfer( + name="web_surfer", + model_client=self.client, + downloads_folder="logs", + debug_dir="logs", + to_save_screenshots=True, + ) + + from autogen_agentchat.teams import MagenticOneGroupChat + + team = MagenticOneGroupChat( + [general_agent, web_surfer], + model_client=self.client, + max_turns=20, + ) + + # Get the team's response to the task. + task_result: TaskResult = await team.run(task=task) + + assert isinstance(task_result, TaskResult) + messages = task_result.messages + + response_str_list: List[str] = [] + for message in messages: + content = message.content + if isinstance(content, str): + content_str = content + else: + content_str = "Not a string." + response_str_list.append(content_str) + response_str = "\n".join(response_str_list) + + self.logger.info("\n----- RESPONSE -----\n\n{}\n".format(response_str)) + + # MagenticOne's response is the chat history, which we use here as the work history. 
+ work_history = response_str + + self.logger.leave_function() + return response_str, work_history diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/chat_completion_client_recorder.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/chat_completion_client_recorder.py new file mode 100644 index 000000000000..3e722e103c22 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/chat_completion_client_recorder.py @@ -0,0 +1,220 @@ +import json +import os +import warnings +from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Sequence, TypedDict, Union + +from autogen_core import CancellationToken +from autogen_core.models import ( + ChatCompletionClient, + CreateResult, + LLMMessage, + ModelCapabilities, # type: ignore + ModelInfo, + RequestUsage, +) +from autogen_core.tools import Tool, ToolSchema + +from .page_logger import PageLogger + + +class RecordDict(TypedDict): + mode: Literal["create", "create_stream"] + messages: List[Mapping[str, Any]] + response: Dict[str, Any] + stream: List[Mapping[str, Any]] + + +class ChatCompletionClientRecorder(ChatCompletionClient): + """ + A chat completion client that supports fast, large-scale tests of code calling LLM clients. + + Two modes are supported: + + 1. "record": delegates to the underlying client while also recording the input messages and responses, + which are saved to disk when finalize() is called. + 2. "replay": loads previously recorded message and responses from disk, then on each call + checks that its message matches the recorded message, and returns the recorded response. + + The recorded data is stored as a JSON list of records. Each record is a dictionary with a "mode" + field (either "create" or "create_stream"), a serialized list of messages, and either a "response" (for + create calls) or a "stream" (a list of streamed outputs for create_stream calls). 
+ + ReplayChatCompletionClient and ChatCompletionCache do similar things, but with significant differences: + - ReplayChatCompletionClient replays pre-defined responses in a specified order + without recording anything or checking the messages sent to the client. + - ChatCompletionCache caches responses and replays them for messages that have been seen before, + regardless of order, and calls the base client for any uncached messages. + """ + + def __init__( + self, + client: ChatCompletionClient, + mode: Literal["record", "replay"], + session_file_path: str, + logger: PageLogger | None = None, + ) -> None: + if logger is None: + self.logger = PageLogger() # Disabled by default. + else: + self.logger = logger + self.logger.enter_function() + self.logger.info("Wrapping the base client in ChatCompletionClientRecorder.") + + self.base_client = client + self.mode = mode + self.session_file_path = os.path.expanduser(session_file_path) + self.records: List[RecordDict] = [] + self._record_index = 0 + self._num_checked_records = 0 + if self.mode == "record": + # Prepare to record the messages and responses. + self.logger.info("Recording mode enabled.\nRecording session to: " + self.session_file_path) + elif self.mode == "replay": + # Load the previously recorded messages and responses from disk. 
+ self.logger.info("Replay mode enabled.\nRetrieving session from: " + self.session_file_path) + try: + with open(self.session_file_path, "r") as f: + self.records = json.load(f) + except Exception as e: + error_str = f"\nFailed to load recorded session: '{self.session_file_path}': {e}" + self.logger.error(error_str) + raise ValueError(error_str) from e + + self.logger.leave_function() + + async def create( + self, + messages: Sequence[LLMMessage], + *, + tools: Sequence[Tool | ToolSchema] = [], + json_output: Optional[bool] = None, + extra_create_args: Mapping[str, Any] = {}, + cancellation_token: Optional[CancellationToken] = None, + ) -> CreateResult: + current_messages: List[Mapping[str, Any]] = [msg.model_dump() for msg in messages] + if self.mode == "record": + response = await self.base_client.create( + messages, + tools=tools, + json_output=json_output, + extra_create_args=extra_create_args, + cancellation_token=cancellation_token, + ) + + rec: RecordDict = { + "mode": "create", + "messages": current_messages, + "response": response.model_dump(), + "stream": [], + } + self.records.append(rec) + return response + elif self.mode == "replay": + if self._record_index >= len(self.records): + error_str = "\nNo more recorded turns to check." + self.logger.error(error_str) + raise ValueError(error_str) + rec = self.records[self._record_index] + if rec.get("mode") != "create": + error_str = f"\nRecorded call type mismatch at index {self._record_index}: expected 'create', got '{rec.get('mode')}'." + self.logger.error(error_str) + raise ValueError(error_str) + recorded_messages = rec.get("messages") + if recorded_messages != current_messages: + error_str = ( + "\nCurrent message list doesn't match the recorded message list. See the pagelogs for details." 
+ ) + assert recorded_messages is not None + self.logger.log_dict_list(recorded_messages, "recorded message list") + assert current_messages is not None + self.logger.log_dict_list(current_messages, "current message list") + self.logger.error(error_str) + raise ValueError(error_str) + self._record_index += 1 + self._num_checked_records += 1 + + data = rec.get("response") + # Populate a CreateResult from the data. + assert data is not None + result = CreateResult( + content=data.get("content", ""), + finish_reason=data.get("finish_reason", "stop"), + usage=data.get("usage", RequestUsage(prompt_tokens=0, completion_tokens=0)), + cached=True, + ) + return result + + else: + error_str = f"\nUnknown mode: {self.mode}" + self.logger.error(error_str) + raise ValueError(error_str) + + def create_stream( + self, + messages: Sequence[LLMMessage], + *, + tools: Sequence[Tool | ToolSchema] = [], + json_output: Optional[bool] = None, + extra_create_args: Mapping[str, Any] = {}, + cancellation_token: Optional[CancellationToken] = None, + ) -> AsyncGenerator[Union[str, CreateResult], None]: + return self.base_client.create_stream( + messages, + tools=tools, + json_output=json_output, + extra_create_args=extra_create_args, + cancellation_token=cancellation_token, + ) + + def actual_usage(self) -> RequestUsage: + # Calls base_client.actual_usage() and returns the result. + return self.base_client.actual_usage() + + def total_usage(self) -> RequestUsage: + # Calls base_client.total_usage() and returns the result. + return self.base_client.total_usage() + + def count_tokens(self, messages: Sequence[LLMMessage], *, tools: Sequence[Tool | ToolSchema] = []) -> int: + # Calls base_client.count_tokens() and returns the result. + return self.base_client.count_tokens(messages, tools=tools) + + def remaining_tokens(self, messages: Sequence[LLMMessage], *, tools: Sequence[Tool | ToolSchema] = []) -> int: + # Calls base_client.remaining_tokens() and returns the result. 
+ return self.base_client.remaining_tokens(messages, tools=tools) + + @property + def capabilities(self) -> ModelCapabilities: # type: ignore + # Calls base_client.capabilities and returns the result. + warnings.warn("capabilities is deprecated, use model_info instead", DeprecationWarning, stacklevel=2) + return self.base_client.capabilities + + @property + def model_info(self) -> ModelInfo: + # Calls base_client.model_info and returns the result. + return self.base_client.model_info + + def finalize(self) -> None: + """ + In record mode, saves the accumulated records to disk. + In replay mode, makes sure all the records were checked. + """ + self.logger.enter_function() + if self.mode == "record": + try: + # Create the directory if it doesn't exist. + os.makedirs(os.path.dirname(self.session_file_path), exist_ok=True) + # Write the records to disk. + with open(self.session_file_path, "w") as f: + json.dump(self.records, f, indent=2) + self.logger.info("\nRecorded session was saved to: " + self.session_file_path) + except Exception as e: + error_str = f"Failed to write records to '{self.session_file_path}': {e}" + self.logger.error(error_str) + raise ValueError(error_str) from e + elif self.mode == "replay": + if self._num_checked_records < len(self.records): + error_str = f"\nEarly termination. Only {self._num_checked_records} of the {len(self.records)} recorded turns were checked." 
+ self.logger.error(error_str) + raise ValueError(error_str) + self.logger.info("\nRecorded session was fully replayed and checked.") + self.logger.leave_function() diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/grader.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/grader.py new file mode 100644 index 000000000000..ffe679a1d160 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/grader.py @@ -0,0 +1,179 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Tuple, Union + +from autogen_core import Image +from autogen_core.models import ( + AssistantMessage, + ChatCompletionClient, + CreateResult, + LLMMessage, + SystemMessage, + UserMessage, +) + +from ._functions import UserContent +from .page_logger import PageLogger + +if TYPE_CHECKING: + from .apprentice import Apprentice + + +class Grader: + """ + Runs basic tests, and determines task success without limitation to string matches. + + Args: + client: The client to call the model. + logger: An optional logger. If None, no logging will be performed. + """ + + def __init__(self, client: ChatCompletionClient, logger: PageLogger | None = None) -> None: + if logger is None: + logger = PageLogger() # Nothing will be logged by this object. 
+ self.logger = logger + self.client = client + + # Create the chat history + self._chat_history: List[LLMMessage] = [] + + async def test_apprentice( + self, + apprentice: Apprentice, + task_description: str, + expected_answer: str, + num_trials: int, + use_memory: bool, + client: ChatCompletionClient, + ) -> Tuple[int, int]: + self.logger.enter_function() + + self.logger.info("Testing the apprentice on the given task.\n") + + num_successes = 0 + + for trial in range(num_trials): + self.logger.info("\n----- TRIAL {} -----\n".format(trial + 1)) + self.logger.info("Try to solve the task.\n") + response = await apprentice.assign_task(task_description, use_memory=use_memory) + response_is_correct, extracted_answer = await self.is_response_correct( + task_description, response, expected_answer + ) + self.logger.info("Extracted answer: {}".format(extracted_answer)) + if response_is_correct: + self.logger.info("Answer is CORRECT.\n") + num_successes += 1 + else: + self.logger.info("Answer is INCORRECT.\n") + + self.logger.info("\nSuccess rate: {}%\n".format(round((num_successes / num_trials) * 100))) + self.logger.leave_function() + return num_successes, num_trials + + async def call_model( + self, + summary: str, + user_content: UserContent, + system_message_content: str | None = None, + keep_these_messages: bool = True, + ) -> str: + """ + Calls the model client with the given input and returns the response. + """ + # Prepare the input message list + if system_message_content is None: + system_message_content = "You are a helpful assistant." + system_message: LLMMessage + if self.client.model_info["family"] == "o1": + # No system message allowed, so pass it as the first user message. + system_message = UserMessage(content=system_message_content, source="User") + else: + # System message allowed. 
+ system_message = SystemMessage(content=system_message_content) + user_message = UserMessage(content=user_content, source="User") + input_messages = [system_message] + self._chat_history + [user_message] + + # Call the model. + response = await self.client.create(input_messages) + assert isinstance(response, CreateResult) + response_string = response.content + assert isinstance(response_string, str) + response_message = AssistantMessage(content=response_string, source="Assistant") + assert isinstance(response_message, AssistantMessage) + + # Log the model call + self.logger.log_model_call(summary=summary, input_messages=input_messages, response=response) + + # Manage the chat history + if keep_these_messages: + self._chat_history.append(user_message) + self._chat_history.append(response_message) + + # Return the response as a string + return response_string + + def _clear_history(self) -> None: + """ + Empties the message list containing the chat history. + """ + self._chat_history = [] + + async def is_response_correct( + self, task_description: str, response_to_be_graded: str, correct_answer: str + ) -> Tuple[bool, str]: + """ + Determines whether the response is equivalent to the task's correct answer. + """ + self.logger.enter_function() + + sys_message = """You are a helpful and thoughtful assistant.""" + + # Ask the model to extract the answer from the response. + user_message: List[Union[str, Image]] = [] + user_message.append("""Your job is to extract a possible answer to the following question from the given text. +- First review the following task. +- Then review the text that follows, which may an answer, plus reasoning that led to the answer. +- Do not attempt to actually solve the task yourself. +- Don't try to judge whether the reasoning steps were correct. +- Simply respond by summarizing the answer described in the text, omitting any other parts of the text. 
+- If no answer is present can be extracted from the text, simply reply "None".""") + user_message.append("\n# Task description") + user_message.append(task_description) + user_message.append("\n# Text that may contain an answer") + user_message.append(response_to_be_graded) + user_message_arg: UserContent = user_message + self._clear_history() + extracted_answer = await self.call_model( + summary="Ask the model to extract the answer", + system_message_content=sys_message, + user_content=user_message_arg, + ) + self.logger.info("Extracted answer: " + extracted_answer) + + # Ask the model to check the answer for correctness. + user_message = [ + """Your job is to decide whether a given answer to a task is correct or not. +- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded. +- In general, an answer is correct if it is equivalent to the correct answer. +- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer. +- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary. +- An answer should be considered correct if it omits information that is clearly inferred. + - For instance, if the correct answer is "Paris, France", the answer "Paris" should be considered correct. 
+- Respond with a single character: '1' if the answer to be graded is correct", '0' if not.""" + ] + user_message.append("\n# Task description") + user_message.append(task_description) + user_message.append("\n# Correct answer") + user_message.append(correct_answer) + user_message.append("\n# Answer to be graded") + user_message.append(extracted_answer) + self._clear_history() + decision = await self.call_model( + summary="Ask the model to check the answer for correctness", + system_message_content=sys_message, + user_content=user_message, + ) + self.logger.info("Decision: " + decision) + + self.logger.leave_function() + return decision == "1", extracted_answer diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/page_logger.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/page_logger.py new file mode 100644 index 000000000000..92964dfbec12 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/page_logger.py @@ -0,0 +1,546 @@ +import inspect +import json +import os +import shutil +from typing import Any, Dict, List, Mapping, Optional, Sequence, TypedDict + +from autogen_agentchat.base import TaskResult +from autogen_agentchat.messages import AgentEvent, ChatMessage +from autogen_core import Image +from autogen_core.models import ( + AssistantMessage, + CreateResult, + FunctionExecutionResultMessage, + LLMMessage, + RequestUsage, + SystemMessage, + UserMessage, +) + +from ._functions import MessageContent, hash_directory + + +def _html_opening(file_title: str, finished: bool = False) -> str: + """ + Returns the opening text of a simple HTML file. + """ + refresh_tag = '' if not finished else "" + st = f""" + + + + {refresh_tag} + {file_title} + + + """ + return st + + +def _html_closing() -> str: + """ + Return the closing text of a simple HTML file. 
+ """ + return """""" + + +# Following the nested-config pattern, this TypedDict minimizes code changes by encapsulating +# the settings that change frequently, as when loading many settings from a single YAML file. +class PageLoggerConfig(TypedDict, total=False): + level: str + path: str + + +class PageLogger: + """ + Logs text and images to a set of HTML pages, one per function/method, linked to each other in a call tree. + + Args: + config: An optional dict that can be used to override the following values: + + - level: The logging level, one of DEBUG, INFO, WARNING, ERROR, CRITICAL, or NONE. + - path: The path to the directory where the log files will be written. + """ + + def __init__(self, config: PageLoggerConfig | None = None) -> None: + self.levels = { + "DEBUG": 10, + "INFO": 20, + "WARNING": 30, + "ERROR": 40, + "CRITICAL": 50, + "NONE": 100, + } + + # Apply default settings and any config overrides. + level_str = "NONE" # Default to no logging at all. + self.log_dir = "./pagelogs/default" + if config is not None: + level_str = config.get("level", level_str) + self.log_dir = config.get("path", self.log_dir) + self.level = self.levels[level_str] + self.log_dir = os.path.expanduser(self.log_dir) + + # If the logging level is set to NONE or higher, don't log anything. + if self.level >= self.levels["NONE"]: + return + + self.page_stack = PageStack() + self.pages: List[Page] = [] + self.last_page_id = 0 + self.name = "0 Call Tree" + self._create_run_dir() + self.flush() + self.finalized = False + + def __del__(self) -> None: + self.finalize() + + def finalize(self) -> None: + # Writes a hash of the log directory to a file for change detection. + if self.level >= self.levels["NONE"]: + return + + # Don't finalize the log if it has already been finalized. + if self.finalized: + return + + # Do nothing if the app is being forced to exit early. 
+ if self.page_stack.size() > 0: + return + + self.flush(finished=True) + + # Write the hash and other details to a file. + hash_str, num_files, num_subdirs = hash_directory(self.log_dir) + hash_path = os.path.join(self.log_dir, "hash.txt") + with open(hash_path, "w") as f: + f.write(hash_str) + f.write("\n") + f.write("{} files\n".format(num_files)) + f.write("{} subdirectories\n".format(num_subdirs)) + + self.finalized = True + + @staticmethod + def _decorate_text(text: str, color: str, weight: str = "bold", demarcate: bool = False) -> str: + """ + Returns a string of text with HTML styling for weight and color. + """ + if demarcate: + text = f"<<<<< {text} >>>>>" + return f'{text}' + + @staticmethod + def _link_to_image(image_path: str, description: str) -> str: + """ + Returns an HTML string defining a thumbnail link to an image. + """ + # To avoid a bug in heml rendering aht displays underscores to the left of thumbnails, + # define the following string on a single line. + link = f"""{description}""" + return link + + def _get_next_page_id(self) -> int: + """Returns the next page id and increments the counter.""" + self.last_page_id += 1 + return self.last_page_id + + def _create_run_dir(self) -> None: + """Creates a fresh log directory.""" + if os.path.exists(self.log_dir): + shutil.rmtree(self.log_dir) + os.makedirs(self.log_dir) + + def _add_page(self, summary: str, show_in_call_tree: bool = True, finished: bool = True) -> "Page": + """ + Adds a new page to the log. + """ + page = Page( + page_logger=self, + index=self._get_next_page_id(), + summary=summary, + indent_level=len(self.page_stack.stack), + show_in_call_tree=show_in_call_tree, + finished=finished, + ) + self.pages.append(page) + self.flush() + if len(self.page_stack.stack) > 0: + # Insert a link to the new page into the calling page. + self.info("\n" + page.full_link) + return page + + def _log_text(self, text: str) -> None: + """ + Adds text to the current page. 
+ """ + page = self.page_stack.top() + if page is not None: + page.add_lines(text, flush=True) + + def debug(self, line: str) -> None: + """ + Adds DEBUG text to the current page if debugging level <= DEBUG. + """ + if self.level <= self.levels["DEBUG"]: + self._log_text(line) + + def info(self, line: str) -> None: + """ + Adds INFO text to the current page if debugging level <= INFO. + """ + if self.level <= self.levels["INFO"]: + self._log_text(line) + + def warning(self, line: str) -> None: + """ + Adds WARNING text to the current page if debugging level <= WARNING. + """ + if self.level <= self.levels["WARNING"]: + self._log_text(line) + + def error(self, line: str) -> None: + """ + Adds ERROR text to the current page if debugging level <= ERROR. + """ + if self.level <= self.levels["ERROR"]: + self._log_text(line) + + def critical(self, line: str) -> None: + """ + Adds CRITICAL text to the current page if debugging level <= CRITICAL. + """ + if self.level <= self.levels["CRITICAL"]: + self._log_text(line) + + def _message_source(self, message: LLMMessage) -> str: + """ + Returns a decorated string indicating the source of a message. + """ + source = "UNKNOWN" + color = "black" + if isinstance(message, SystemMessage): + source = "SYSTEM" + color = "purple" + elif isinstance(message, UserMessage): + source = "USER" + color = "blue" + elif isinstance(message, AssistantMessage): + source = "ASSISTANT" + color = "green" + elif isinstance(message, FunctionExecutionResultMessage): + source = "FUNCTION" + color = "red" + return self._decorate_text(source, color, demarcate=True) + + def _format_message_content(self, message_content: MessageContent) -> str: + """ + Formats the message content for logging. + """ + # Start by converting the message content to a list of strings. 
+ content_list: List[str] = [] + content = message_content + if isinstance(content, str): + content_list.append(content) + elif isinstance(content, list): + for item in content: + if isinstance(item, str): + content_list.append(item.rstrip()) + elif isinstance(item, Image): + # Save the image to disk. + image_filename = str(self._get_next_page_id()) + " image.jpg" + image_path = os.path.join(self.log_dir, image_filename) + item.image.save(image_path) + # Add a link to the image. + content_list.append(self._link_to_image(image_filename, "message_image")) + elif isinstance(item, Dict): + # Add a dictionary to the log. + json_str = json.dumps(item, indent=4) + content_list.append(json_str) + else: + content_list.append(str(item).rstrip()) + else: + content_list.append("") + + # Convert the list of strings to a single string containing newline separators. + output = "" + for item in content_list: + output += f"\n{item}\n" + return output + + def log_message_content(self, message_content: MessageContent, summary: str) -> None: + """ + Adds a page containing the message's content, including any images. + """ + if self.level > self.levels["INFO"]: + return None + page = self._add_page(summary=summary, show_in_call_tree=False) + self.page_stack.write_stack_to_page(page) + page.add_lines(self._format_message_content(message_content=message_content)) + page.flush() + + def log_dict_list(self, content: List[Mapping[str, Any]], summary: str) -> None: + """ + Adds a page containing a list of dicts. 
+ """ + if self.level > self.levels["INFO"]: + return None + page = self._add_page(summary=summary, show_in_call_tree=False) + self.page_stack.write_stack_to_page(page) + + for item in content: + json_str = json.dumps(item, indent=4) + page.add_lines(json_str) + + page.flush() + + def _log_model_messages( + self, summary: str, input_messages: List[LLMMessage], response_str: str, usage: RequestUsage | None + ) -> Optional["Page"]: + """ + Adds a page containing the messages to a model (including any input images) and its response. + """ + page = self._add_page(summary=summary, show_in_call_tree=False) + self.page_stack.write_stack_to_page(page) + + if usage is not None: + page.add_lines("{} prompt tokens".format(usage.prompt_tokens)) + page.add_lines("{} completion tokens".format(usage.completion_tokens)) + for m in input_messages: + page.add_lines("\n" + self._message_source(m)) + page.add_lines(self._format_message_content(message_content=m.content)) + page.add_lines("\n" + self._decorate_text("ASSISTANT RESPONSE", "green", demarcate=True)) + page.add_lines("\n" + response_str + "\n") + page.flush() + return page + + def log_model_call( + self, summary: str, input_messages: List[LLMMessage], response: CreateResult + ) -> Optional["Page"]: + """ + Logs messages sent to a model and the TaskResult response to a new page. + """ + if self.level > self.levels["INFO"]: + return None + + response_str = response.content + if not isinstance(response_str, str): + response_str = "??" + + page = self._log_model_messages(summary, input_messages, response_str, response.usage) + return page + + def log_model_task( + self, summary: str, input_messages: List[LLMMessage], task_result: TaskResult + ) -> Optional["Page"]: + """ + Logs messages sent to a model and the TaskResult response to a new page. 
+ """ + if self.level > self.levels["INFO"]: + return None + + messages: Sequence[AgentEvent | ChatMessage] = task_result.messages + message = messages[-1] + response_str = message.content + if not isinstance(response_str, str): + response_str = "??" + + if hasattr(message, "models_usage"): + usage: RequestUsage | None = message.models_usage + else: + usage = RequestUsage(prompt_tokens=0, completion_tokens=0) + + page = self._log_model_messages(summary, input_messages, response_str, usage) + return page + + def log_link_to_local_file(self, file_path: str) -> str: + """ + Returns a link to a local file in the log. + """ + file_name = os.path.basename(file_path) + link = f'{file_name}' + return link + + def add_link_to_image(self, description: str, source_image_path: str) -> None: + """ + Inserts a thumbnail link to an image to the page. + """ + # Remove every character from the string 'description' that is not alphanumeric or a space. + description = "".join(e for e in description if e.isalnum() or e.isspace()) + target_image_filename = str(self._get_next_page_id()) + " - " + description + # Copy the image to the log directory. + local_image_path = os.path.join(self.log_dir, target_image_filename) + shutil.copyfile(source_image_path, local_image_path) + self._log_text("\n" + description) + self._log_text(self._link_to_image(target_image_filename, description)) + + def flush(self, finished: bool = False) -> None: + """ + Writes the current state of the log to disk. + """ + if self.level > self.levels["INFO"]: + return + # Create a call tree of the log. + call_tree_path = os.path.join(self.log_dir, self.name + ".html") + with open(call_tree_path, "w") as f: + f.write(_html_opening("0 Call Tree", finished=finished)) + f.write(f"

{self.name}

") + f.write("\n") + for page in self.pages: + if page.show_in_call_tree: + f.write(page.line_text + "\n") + f.write("\n") + f.write(_html_closing()) + + def enter_function(self) -> Optional["Page"]: + """ + Adds a new page corresponding to the current function call. + """ + if self.level > self.levels["INFO"]: + return None + + page = None + frame_type = inspect.currentframe() + if frame_type is not None: + frame = frame_type.f_back # Get the calling frame + if frame is not None: + # Check if it's a method by looking for 'self' or 'cls' in f_locals + if "self" in frame.f_locals: + class_name = type(frame.f_locals["self"]).__name__ + elif "cls" in frame.f_locals: + class_name = frame.f_locals["cls"].__name__ + else: + class_name = None # Not part of a class + + if class_name is None: # Not part of a class + caller_name = frame.f_code.co_name + else: + caller_name = class_name + "." + frame.f_code.co_name + + # Create a new page for this function. + page = self._add_page(summary=caller_name, show_in_call_tree=True, finished=False) + self.page_stack.push(page) + self.page_stack.write_stack_to_page(page) + + page.add_lines("\nENTER {}".format(caller_name), flush=True) + return page + + def leave_function(self) -> None: + """ + Finishes the page corresponding to the current function call. + """ + if self.level > self.levels["INFO"]: + return None + page = self.page_stack.top() + if page is not None: + page.finished = True + page.add_lines("\nLEAVE {}".format(page.summary), flush=True) + self.page_stack.pop() + + +class Page: + """ + Represents a single HTML page in the logger output. + + Args: + page_logger: The PageLogger object that created this page. + index: The index of the page. + summary: A brief summary of the page's contents for display. + indent_level: The level of indentation in the call tree. + show_in_call_tree: Whether to display the page in the call tree. + finished: Whether the page is complete. 
+ """ + + def __init__( + self, + page_logger: PageLogger, + index: int, + summary: str, + indent_level: int, + show_in_call_tree: bool = True, + finished: bool = True, + ): + """ + Initializes and writes to a new HTML page. + """ + self.page_logger = page_logger + self.index_str = str(index) + self.summary = summary + self.indent_level = indent_level + self.show_in_call_tree = show_in_call_tree + self.finished = finished + self.file_title = self.index_str + " " + self.summary + self.indentation_text = "| " * self.indent_level + self.full_link = f'{self.file_title}' + self.line_text = self.indentation_text + self.full_link + self.lines: List[str] = [] + self.flush() + + def add_lines(self, lines: str, flush: bool = False) -> None: + """ + Adds one or more lines to the page. + """ + lines_to_add: List[str] = [] + if "\n" in lines: + lines_to_add = lines.split("\n") + else: + lines_to_add.append(lines) + self.lines.extend(lines_to_add) + if flush: + self.flush() + + def flush(self) -> None: + """ + Writes the HTML page to disk. + """ + page_path = os.path.join(self.page_logger.log_dir, self.index_str + ".html") + with open(page_path, "w") as f: + f.write(_html_opening(self.file_title, finished=self.finished)) + f.write(f"

{self.file_title}

\n") + for line in self.lines: + try: + f.write(f"{line}\n") + except UnicodeEncodeError: + f.write("UnicodeEncodeError in this line.\n") + f.write(_html_closing()) + f.flush() + + +class PageStack: + """ + A call stack containing a list of currently active function pages in the order they called each other. + """ + + def __init__(self) -> None: + self.stack: List[Page] = [] + + def push(self, page: Page) -> None: + """Adds a page to the top of the stack.""" + self.stack.append(page) + + def pop(self) -> Page: + """Removes and returns the top page from the stack""" + return self.stack.pop() + + def size(self) -> int: + """Returns the number of pages in the stack.""" + return len(self.stack) + + def top(self) -> Page | None: + """Returns the top page from the stack without removing it""" + if self.size() == 0: + return None + return self.stack[-1] + + def write_stack_to_page(self, page: Page) -> None: + # Logs a properly indented string displaying the current call stack. + page.add_lines("\nCALL STACK") + for stack_page in self.stack: + page.add_lines(stack_page.line_text) + page.add_lines("") + page.add_lines("") + page.flush() diff --git a/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/teachability.py b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/teachability.py new file mode 100644 index 000000000000..f8a09ee40e34 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/experimental/task_centric_memory/utils/teachability.py @@ -0,0 +1,132 @@ +from typing import TYPE_CHECKING, Any + +from autogen_core import CancellationToken, Image +from autogen_core.memory import Memory, MemoryContent, MemoryMimeType, MemoryQueryResult, UpdateContextResult +from autogen_core.model_context import ChatCompletionContext +from autogen_core.models import UserMessage + +if TYPE_CHECKING: + from autogen_ext.experimental.task_centric_memory import MemoryController + + +class Teachability(Memory): + """ + 
Gives an AssistantAgent the ability to learn quickly from user teachings, hints, and advice. + + Steps for usage: + 1. Instantiate MemoryController. + 2. Instantiate Teachability, passing the memory controller as a parameter. + 3. Instantiate an AssistantAgent, passing the teachability instance (wrapped in a list) as the memory parameter. + 4. Use the AssistantAgent as usual, such as for chatting with the user. + """ + + def __init__(self, memory_controller: "MemoryController", name: str | None = None) -> None: + """Initialize Teachability.""" + self._memory_controller = memory_controller + self._logger = memory_controller.logger + self._name = name or "teachability" + + @property + def name(self) -> str: + """Get the memory instance identifier.""" + return self._name + + def _extract_text(self, content_item: str | MemoryContent) -> str: + """Extract searchable text from content.""" + if isinstance(content_item, str): + return content_item + + content = content_item.content + mime_type = content_item.mime_type + + if mime_type in [MemoryMimeType.TEXT, MemoryMimeType.MARKDOWN]: + return str(content) + elif mime_type == MemoryMimeType.JSON: + if isinstance(content, dict): + # Store original JSON string representation + return str(content).lower() + raise ValueError("JSON content must be a dict") + elif isinstance(content, Image): + raise ValueError("Image content cannot be converted to text") + else: + raise ValueError(f"Unsupported content type: {mime_type}") + + async def update_context( + self, + model_context: ChatCompletionContext, + ) -> UpdateContextResult: + """ + Extracts any advice from the last user turn to be stored in memory, + and adds any relevant memories to the model context. 
+ """ + self._logger.enter_function() + + # Extract text from the user's last message + messages = await model_context.get_messages() + if not messages: + self._logger.leave_function() + return UpdateContextResult(memories=MemoryQueryResult(results=[])) + last_message = messages[-1] + last_user_text = last_message.content if isinstance(last_message.content, str) else str(last_message) + + # Add any relevant memories to the chat history + query_results = await self.query(last_user_text) + if query_results.results: + memory_strings = [f"{i}. {str(memory.content)}" for i, memory in enumerate(query_results.results, 1)] + memory_context = "\nPotentially relevant memories:\n" + "\n".join(memory_strings) + await model_context.add_message(UserMessage(content=memory_context, source="user")) + + # Add any user advice to memory + await self._memory_controller.consider_memo_storage(last_user_text) + + self._logger.leave_function() + return UpdateContextResult(memories=query_results) + + async def add(self, content: MemoryContent, cancellation_token: CancellationToken | None = None) -> None: + """ + Tries to extract any advice from the passed content and add it to memory. + """ + self._logger.enter_function() + + # Extract text from the incoming content + text = self._extract_text(content) + + # Check for advice to add to memory for later turns. + await self._memory_controller.consider_memo_storage(text) + + self._logger.leave_function() + + async def query( + self, + query: str | MemoryContent, + cancellation_token: CancellationToken | None = None, + **kwargs: Any, + ) -> MemoryQueryResult: + """ + Returns any memories that seem relevant to the query. 
+ """ + self._logger.enter_function() + + task = self._extract_text(query) + memory_results: list[MemoryContent] = [] + filtered_memos = await self._memory_controller.retrieve_relevant_memos(task=task) + filtered_insights = [memo.insight for memo in filtered_memos] + for insight in filtered_insights: + self._logger.info(f"Insight: {insight}") + memory_content = MemoryContent( + content=insight, + mime_type="MemoryMimeType.TEXT", + metadata={}, + ) + memory_results.append(memory_content) + + self._logger.leave_function() + return MemoryQueryResult(results=memory_results) + + async def clear(self) -> None: + """Clear all entries from memory.""" + self._memory_controller.reset_memory() + + async def close(self) -> None: + """Clean up memory resources.""" + pass # No cleanup needed for this memory implementation diff --git a/python/packages/autogen-ext/tests/task_centric_memory/configs/demonstration.yaml b/python/packages/autogen-ext/tests/task_centric_memory/configs/demonstration.yaml new file mode 100644 index 000000000000..88787c083ccf --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/configs/demonstration.yaml @@ -0,0 +1,31 @@ + +PageLogger: + level: DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL, or NONE. + path: ./tests/task_centric_memory/pagelogs/demonstration + +client: + model: gpt-4o-2024-08-06 + temperature: 0.8 + max_completion_tokens: 4096 + presence_penalty: 0.0 + frequency_penalty: 0.0 + top_p: 1.0 + max_retries: 65535 + +Apprentice: + name_of_agent_or_team: AssistantAgent # AssistantAgent or MagenticOneGroupChat + disable_prefix_caching: 0 # If true, prepends a small random string to the context, to decorrelate repeated runs. 
+ MemoryController: + max_train_trials: 10 + max_test_trials: 3 + MemoryBank: + path: ./tests/task_centric_memory/memory_bank/demonstration + relevance_conversion_threshold: 1.7 + n_results: 25 + distance_threshold: 100 + +test: + main_task_file: tests/task_centric_memory/data_files/tasks/cell_towers_1.yaml # The task being tested. + demo_task_file: tests/task_centric_memory/data_files/tasks/cell_towers_2.yaml # A similar but different task. + demo_solution_file: tests/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml # A demonstration of solving the second task. + num_trials: 1 diff --git a/python/packages/autogen-ext/tests/task_centric_memory/configs/self_teaching.yaml b/python/packages/autogen-ext/tests/task_centric_memory/configs/self_teaching.yaml new file mode 100644 index 000000000000..530fdb9bcba9 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/configs/self_teaching.yaml @@ -0,0 +1,31 @@ + +PageLogger: + level: DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL, or NONE. + path: ./tests/task_centric_memory/pagelogs/self_teaching + +client: + model: gpt-4o-2024-08-06 + temperature: 0.8 + max_completion_tokens: 4096 + presence_penalty: 0.0 + frequency_penalty: 0.0 + top_p: 1.0 + max_retries: 65535 + +Apprentice: + name_of_agent_or_team: AssistantAgent # AssistantAgent or MagenticOneGroupChat + disable_prefix_caching: 0 # If true, prepends a small random string to the context, to decorrelate repeated runs. + MemoryController: + max_train_trials: 2 + max_test_trials: 1 + MemoryBank: + path: ./tests/task_centric_memory/memory_bank/self_teaching + relevance_conversion_threshold: 1.7 + n_results: 25 + distance_threshold: 100 + +test: + task_file_1: tests/task_centric_memory/data_files/tasks/10_liars.yaml # Train and test on this task. + task_file_2: tests/task_centric_memory/data_files/tasks/100_vampires.yaml # Test generalization on this different, similar task. 
+ num_loops: 1 + num_final_test_trials: 1 diff --git a/python/packages/autogen-ext/tests/task_centric_memory/configs/teachability.yaml b/python/packages/autogen-ext/tests/task_centric_memory/configs/teachability.yaml new file mode 100644 index 000000000000..8ad036acc0bf --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/configs/teachability.yaml @@ -0,0 +1,29 @@ + +PageLogger: + level: DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL, or NONE. + path: ./tests/task_centric_memory/pagelogs/teachability + +client: + model: gpt-4o-2024-08-06 + temperature: 0.8 + max_completion_tokens: 4096 + presence_penalty: 0.0 + frequency_penalty: 0.0 + top_p: 1.0 + max_retries: 65535 + +Apprentice: + name_of_agent_or_team: AssistantAgent # AssistantAgent or MagenticOneGroupChat + disable_prefix_caching: 0 # If true, prepends a small random string to the context, to decorrelate repeated runs. + MemoryController: + max_train_trials: 10 + max_test_trials: 3 + MemoryBank: + path: ./tests/task_centric_memory/memory_bank/teachability + relevance_conversion_threshold: 1.7 + n_results: 25 + distance_threshold: 100 + +test: + task_file: tests/task_centric_memory/data_files/tasks/autogen_package.yaml # The task being tested. + insight_file: tests/task_centric_memory/data_files/insights/add_topic.yaml # Advice provided to help solve the task. diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/add_topic.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/add_topic.yaml new file mode 100644 index 000000000000..05e334ccbf7e --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/add_topic.yaml @@ -0,0 +1,6 @@ +# Advice created manually for solving the autogen_package task. 
+ +insight: 'When somebody builds something on autogen and wants to contribute it to autogen, + instead of creating a new autogen package, it''s better for them to implement it in a separate github repo, + then just add the autogen-extension topic to that repo. + That way their contribution will be automatically discoverable through autogen''s page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html' diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml new file mode 100644 index 000000000000..ca00473067ef --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml @@ -0,0 +1,11 @@ +# Demo created manually for solving the cell_towers_2 task. + +insight: 'Sort the houses by location: 3, 6, 10, 11, 12, 17, 19, 20. + Then start at one end and place the towers only where absolutely needed. + The house at 3 could be served by a tower as far away as mile marker 7, because 3 + 4 = 7, so place a tower at 7. + This obviously covers houses up to mile 7. + But a coverage radius of 4 miles (in each direction) means a total coverage of 8 miles. + So the tower at mile 7 would reach all the way to mile 11, covering the houses at 10 and 11. + The next uncovered house would be at mile 12 (not 10), requiring a second tower. + It could go at mile 16 (which is 12 + 4) and this tower would reach up to mile 20 (16 + 4), + covering the remaining houses. So 2 towers would be enough.' 
diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/liar_advice.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/liar_advice.yaml new file mode 100644 index 000000000000..502379d37896 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/insights/liar_advice.yaml @@ -0,0 +1,6 @@ +# Advice created automatically for solving the 10_liars task. + +insight: 'When solving logic puzzles, carefully consider all possible scenarios, + including the simplest ones, and remember that if everyone is lying, + their statements should naturally align with the known conditions without needing a truth-teller. + Always double-check that your conclusions don''t inadvertently introduce contradictions.' diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/100_vampires.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/100_vampires.yaml new file mode 100644 index 000000000000..2e2341d91fd1 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/100_vampires.yaml @@ -0,0 +1,22 @@ +# From GAIA L1 + +task_description: "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021B\ + cu IV, son of Costea, has tasked you with investigating the village of \u0218\ + irnea in neighboring Wallachia. The Count's advisors have reported that a vampire\ + \ was spotted crossing the border near the village, and would like you to investigate\ + \ it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation.\ + \ One night, just before dawn, you catch a glimpse of a man in a long black\ + \ cape with red lining leaping from roof-top to roof-top with superhuman agility.\ + \ It's a vampire! You try to chase the creature back to its home, but the creature\ + \ is too fast. 
However, because of the remoteness of the village, you know with\ + \ absolute certainty that the vampire must be a resident of the village. You\ + \ decide that your best course of action will be to visit all 100 residents\ + \ of the town during the day. You know something about vampires and humans that\ + \ will make your investigation possible; humans always tell the truth, but vampires\ + \ always lie.\n\nIn the afternoon, you go from house to house, speaking with\ + \ all 100 residents of \u0218irnea. You ask everyone the same question: \"How\ + \ many vampires are living in \u0218irnea\". Everyone in the village gives the\ + \ same response, \"At least one of us is a human.\"\n\nHow many residents of\ + \ \u0218irnea have been turned into vampires?" + +expected_answer: '100' diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/10_liars.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/10_liars.yaml new file mode 100644 index 000000000000..096e12775935 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/10_liars.yaml @@ -0,0 +1,8 @@ +# Similar to the 100 vampires task, for testing generalization from one to the other. + +task_description: 'You ask ten people ''How many of you are liars?'' + They all answer ''At least one of us is not a liar.'' + You happen to know that at least one of them IS a liar. + How many of them are liars in total?' + +expected_answer: All of them are liars. diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/autogen_package.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/autogen_package.yaml new file mode 100644 index 000000000000..f80840b30073 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/autogen_package.yaml @@ -0,0 +1,5 @@ +# Test where human advice is needed. 
+ +task_description: As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen? + +expected_answer: It's best to have your agent in its own repo, then add the autogen-extension topic to that repo. diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/cell_towers_1.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/cell_towers_1.yaml new file mode 100644 index 000000000000..f86e370db3ee --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/cell_towers_1.yaml @@ -0,0 +1,9 @@ +# File-free version of a GAIA L1 task. + +task_description: You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. + Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. + Each cell phone tower can cover houses located next to the road within a 4-mile radius. + Find the minimum number of cell phone towers needed to cover all houses next to the road. + Your answer should be a positive numerical integer value. + +expected_answer: '2' diff --git a/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/cell_towers_2.yaml b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/cell_towers_2.yaml new file mode 100644 index 000000000000..5ddc046920c9 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/data_files/tasks/cell_towers_2.yaml @@ -0,0 +1,9 @@ +# Similar to the cell_towers_1 task. + +task_description: You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. + Houses are located at mile markers 17, 20, 19, 10, 11, 12, 3, 6. + Each cell phone tower can cover houses located next to the road within a 4-mile radius. + Find the minimum number of cell phone towers needed to cover all houses next to the road. + Your answer should be a positive numerical integer value. 
+ +expected_answer: '2' diff --git a/python/packages/autogen-ext/tests/task_centric_memory/sessions/demonstration/session.json b/python/packages/autogen-ext/tests/task_centric_memory/sessions/demonstration/session.json new file mode 100644 index 000000000000..8fd867d00434 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/sessions/demonstration/session.json @@ -0,0 +1,407 @@ +[ + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. 
**Understanding the User Message:**\n - You need to determine the minimum number of cell phone towers required to cover all houses located at specific mile markers.\n - Each tower covers a 4-mile radius, meaning it can cover any house located within 4 miles in either direction.\n - The mile markers where the houses are located are: 16, 17, 19, 11, 9, 10, 2, 5, 4.\n\n2. **Possible Responses:**\n - Calculate the optimal placement of towers to cover all houses with a minimal number of towers.\n\n3. **Weighing the Pros and Cons:**\n - **Calculate Optimal Placement:**\n - **Pros:** Provides a clear and accurate solution to cover all houses with the fewest towers.\n - **Cons:** Requires careful consideration of overlapping coverage areas.\n\n4. **Critique the Pros and Cons:**\n - The only viable approach is to calculate the optimal placement as it directly addresses the problem without unnecessary steps.\n\n5. **Decide on the Best Response:**\n - Calculating the optimal placement is indeed the best approach.\n\n6. 
**Final Response:**\n - To cover all houses, place towers at the following mile markers:\n - One tower at mile 3 (covers houses at 2, 4, 5).\n - Another tower at mile 13 (covers houses at 9, 10, 11).\n - Another tower at mile 17 (covers houses at 16, 17, 19).\n - Thus, the minimum number of towers needed is 3.", + "usage": { + "prompt_tokens": 252, + "completion_tokens": 335 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.", + "\n# Text that may contain an answer", + "1. 
**Understanding the User Message:**\n - You need to determine the minimum number of cell phone towers required to cover all houses located at specific mile markers.\n - Each tower covers a 4-mile radius, meaning it can cover any house located within 4 miles in either direction.\n - The mile markers where the houses are located are: 16, 17, 19, 11, 9, 10, 2, 5, 4.\n\n2. **Possible Responses:**\n - Calculate the optimal placement of towers to cover all houses with a minimal number of towers.\n\n3. **Weighing the Pros and Cons:**\n - **Calculate Optimal Placement:**\n - **Pros:** Provides a clear and accurate solution to cover all houses with the fewest towers.\n - **Cons:** Requires careful consideration of overlapping coverage areas.\n\n4. **Critique the Pros and Cons:**\n - The only viable approach is to calculate the optimal placement as it directly addresses the problem without unnecessary steps.\n\n5. **Decide on the Best Response:**\n - Calculating the optimal placement is indeed the best approach.\n\n6. **Final Response:**\n - To cover all houses, place towers at the following mile markers:\n - One tower at mile 3 (covers houses at 2, 4, 5).\n - Another tower at mile 13 (covers houses at 9, 10, 11).\n - Another tower at mile 17 (covers houses at 16, 17, 19).\n - Thus, the minimum number of towers needed is 3." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "3", + "usage": { + "prompt_tokens": 571, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. 
Your answer should be a positive numerical integer value.", + "\n# Correct answer", + "2", + "\n# Answer to be graded", + "3" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "0", + "usage": { + "prompt_tokens": 305, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 17, 20, 19, 10, 11, 12, 3, 6. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Telecommunications engineering \nCell phone tower placement \nCoverage radius calculation \nHouse location analysis \nMinimum resource allocation \nOptimizing coverage strategy", + "usage": { + "prompt_tokens": 302, + "completion_tokens": 26 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Here's a simplified breakdown of the task description:\n\n1. **Role**: You are acting as a telecommunications engineer.\n2. **Objective**: Determine how many cell phone towers are needed.\n3. **Location**: The task involves a specific stretch of road.\n4. **House Locations**: Houses are positioned at the following mile markers: 2, 4, 5, 9, 10, 11, 16, 17, and 19.\n5. **Coverage Requirement**: Each cell phone tower can cover an area within a 4-mile radius.\n6. 
**Task Goal**: Calculate the minimum number of towers required to cover all the houses mentioned.\n7. **Output**: Provide a single positive integer as the answer.", + "usage": { + "prompt_tokens": 185, + "completion_tokens": 156 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value." + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Here's a simplified breakdown of the task description:\n\n1. **Role**: You are acting as a telecommunications engineer.\n2. **Objective**: Determine how many cell phone towers are needed.\n3. **Location**: The task involves a specific stretch of road.\n4. **House Locations**: Houses are positioned at the following mile markers: 2, 4, 5, 9, 10, 11, 16, 17, and 19.\n5. **Coverage Requirement**: Each cell phone tower can cover an area within a 4-mile radius.\n6. **Task Goal**: Calculate the minimum number of towers required to cover all the houses mentioned.\n7. 
**Output**: Provide a single positive integer as the answer.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "In the context of solving the task, some elements of the list may not be directly relevant to finding the solution:\n\n1. **Role**: Knowing that you are acting as a \"telecommunications engineer\" is not necessary for solving the problem. The focus should be on determining the number of towers, regardless of the role.\n\n2. **Location**: The specific mention of a \"stretch of road\" is not critically relevant as long as you know where the houses are located and their mile markers.\n\nThe core elements needed to solve the task are the **house locations**, **coverage requirement**, and the **goal** of determining the minimum number of towers. The output requirement is also relevant to ensure you present the solution in the correct form.", + "usage": { + "prompt_tokens": 378, + "completion_tokens": 149 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. 
Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value." + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Here's a simplified breakdown of the task description:\n\n1. **Role**: You are acting as a telecommunications engineer.\n2. **Objective**: Determine how many cell phone towers are needed.\n3. **Location**: The task involves a specific stretch of road.\n4. **House Locations**: Houses are positioned at the following mile markers: 2, 4, 5, 9, 10, 11, 16, 17, and 19.\n5. **Coverage Requirement**: Each cell phone tower can cover an area within a 4-mile radius.\n6. **Task Goal**: Calculate the minimum number of towers required to cover all the houses mentioned.\n7. **Output**: Provide a single positive integer as the answer.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "In the context of solving the task, some elements of the list may not be directly relevant to finding the solution:\n\n1. **Role**: Knowing that you are acting as a \"telecommunications engineer\" is not necessary for solving the problem. The focus should be on determining the number of towers, regardless of the role.\n\n2. **Location**: The specific mention of a \"stretch of road\" is not critically relevant as long as you know where the houses are located and their mile markers.\n\nThe core elements needed to solve the task are the **house locations**, **coverage requirement**, and the **goal** of determining the minimum number of towers. 
The output requirement is also relevant to ensure you present the solution in the correct form.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Revise your original list to include only the most general terms, those that are critical to solving the task, removing any themes or descriptions that are not essential to the solution. Your final list may be shorter, but do not leave out any part of the task that is needed for solving the task. Do not add any additional commentary either before or after the list." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. House locations: mile markers 2, 4, 5, 9, 10, 11, 16, 17, 19.\n2. Coverage: each tower covers a 4-mile radius.\n3. Goal: find the minimum number of towers to cover all houses.\n4. Output: a single positive integer.", + "usage": { + "prompt_tokens": 611, + "completion_tokens": 72 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. 
Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "1. House locations: mile markers 2, 4, 5, 9, 10, 11, 16, 17, 19.\n2. Coverage: each tower covers a 4-mile radius.\n3. Goal: find the minimum number of towers to cover all houses.\n4. Output: a single positive integer." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "House locations \nCoverage radius \nMinimum coverage strategy \nOptimization of resources", + "usage": { + "prompt_tokens": 278, + "completion_tokens": 14 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a potential insight that may or may not be useful for solving a given task.\n- First review the following task.\n- Then review the insight that follows, and consider whether it might help solve the given task.\n- Do not attempt to actually solve the task.\n- Reply with a single character, '1' if the insight may be useful, or '0' if it is not.", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. 
Your answer should be a positive numerical integer value.", + "\n# Possibly useful insight", + "Example task:\n\nYou are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 17, 20, 19, 10, 11, 12, 3, 6. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.\n\nExample solution:\n\nSort the houses by location: 3, 6, 10, 11, 12, 17, 19, 20. Then start at one end and place the towers only where absolutely needed. The house at 3 could be served by a tower as far away as mile marker 7, because 3 + 4 = 7, so place a tower at 7. This obviously covers houses up to mile 7. But a coverage radius of 4 miles (in each direction) means a total coverage of 8 miles. So the tower at mile 7 would reach all the way to mile 11, covering the houses at 10 and 11. The next uncovered house would be at mile 12 (not 10), requiring a second tower. It could go at mile 16 (which is 12 + 4) and this tower would reach up to mile 20 (16 + 4), covering the remaining houses. So 2 towers would be enough." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 516, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. 
Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.\n\n## Important insights that may help solve tasks like this\n- Example task:\n\nYou are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 17, 20, 19, 10, 11, 12, 3, 6. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.\n\nExample solution:\n\nSort the houses by location: 3, 6, 10, 11, 12, 17, 19, 20. Then start at one end and place the towers only where absolutely needed. The house at 3 could be served by a tower as far away as mile marker 7, because 3 + 4 = 7, so place a tower at 7. This obviously covers houses up to mile 7. But a coverage radius of 4 miles (in each direction) means a total coverage of 8 miles. So the tower at mile 7 would reach all the way to mile 11, covering the houses at 10 and 11. The next uncovered house would be at mile 12 (not 10), requiring a second tower. It could go at mile 16 (which is 12 + 4) and this tower would reach up to mile 20 (16 + 4), covering the remaining houses. 
So 2 towers would be enough.\n", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "**Understanding of the User Message:**\n\nThe user is asking to determine the minimum number of cell phone towers needed to cover all houses located along a road, given specific mile markers for each house. Each tower has a coverage radius of 4 miles. The user also provided a similar example with a solution, which involves sorting the house locations, determining the optimal placement of cell phone towers, and ensuring all houses are within the coverage radius of at least one tower.\n\n**Possible Responses:**\n\n1. **Apply the algorithm from the example to the given problem:**\n - Sort the house locations.\n - Start placing towers where absolutely needed, considering the 4-mile radius.\n \n2. **Explain the step-by-step process for solving the given problem using a detailed breakdown.**\n\n3. **Provide a numerical integer value as the final answer without explanation.**\n\n4. **Explain why the given problem and example are similar and then solve the problem.**\n\n**Pros and Cons of Each Response:**\n\n1. **Apply the algorithm from the example to the given problem:**\n - Pros: Directly applies the method explained in the example, ensuring clarity and relevance.\n - Cons: Might not provide enough explanation for users unfamiliar with the approach.\n\n2. **Explain the step-by-step process for solving the given problem using a detailed breakdown:**\n - Pros: Provides clarity and understanding of the problem-solving process.\n - Cons: May be too verbose if the user only wants the final answer.\n\n3. **Provide a numerical integer value as the final answer without explanation:**\n - Pros: Quick and straight to the point, for users who want immediate results.\n - Cons: Lacks educational value and might not be satisfying for users looking to understand the solution.\n\n4. 
**Explain why the given problem and example are similar and then solve the problem:**\n - Pros: Shows a connection between similar problems, enhancing understanding.\n - Cons: Could be redundant if the user already understands the similarity.\n\n**Critique of the Pros and Cons:**\n\nEach response has its merits depending on the user's needs. However, since the user has provided a detailed example, it might be beneficial to walk through the steps to demonstrate understanding while delivering the answer concisely. This ensures that all users, whether looking for a quick answer or a detailed explanation, are satisfied.\n\n**Decide on the Best Response:**\n\nConsidering the balance between clarity and conciseness, a response that explains the process briefly and then provides the answer seems most appropriate.\n\n**Final Response:**\n\nTo solve the problem, follow these steps:\n\n1. **Sort the house locations**: 2, 4, 5, 9, 10, 11, 16, 17, 19.\n2. **Place towers strategically**:\n - Cover the house at mile 2 with a tower at mile 6 (2 + 4 = 6), covering up to mile 10.\n - The next uncovered house is at mile 11. Place another tower at mile 15 (11 + 4 = 15), covering up to mile 19.\n\nWith these placements, all houses are covered. 
Thus, the minimum number of cell phone towers needed is **2**.", + "usage": { + "prompt_tokens": 572, + "completion_tokens": 661 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.", + "\n# Text that may contain an answer", + "**Understanding of the User Message:**\n\nThe user is asking to determine the minimum number of cell phone towers needed to cover all houses located along a road, given specific mile markers for each house. Each tower has a coverage radius of 4 miles. The user also provided a similar example with a solution, which involves sorting the house locations, determining the optimal placement of cell phone towers, and ensuring all houses are within the coverage radius of at least one tower.\n\n**Possible Responses:**\n\n1. 
**Apply the algorithm from the example to the given problem:**\n - Sort the house locations.\n - Start placing towers where absolutely needed, considering the 4-mile radius.\n \n2. **Explain the step-by-step process for solving the given problem using a detailed breakdown.**\n\n3. **Provide a numerical integer value as the final answer without explanation.**\n\n4. **Explain why the given problem and example are similar and then solve the problem.**\n\n**Pros and Cons of Each Response:**\n\n1. **Apply the algorithm from the example to the given problem:**\n - Pros: Directly applies the method explained in the example, ensuring clarity and relevance.\n - Cons: Might not provide enough explanation for users unfamiliar with the approach.\n\n2. **Explain the step-by-step process for solving the given problem using a detailed breakdown:**\n - Pros: Provides clarity and understanding of the problem-solving process.\n - Cons: May be too verbose if the user only wants the final answer.\n\n3. **Provide a numerical integer value as the final answer without explanation:**\n - Pros: Quick and straight to the point, for users who want immediate results.\n - Cons: Lacks educational value and might not be satisfying for users looking to understand the solution.\n\n4. **Explain why the given problem and example are similar and then solve the problem:**\n - Pros: Shows a connection between similar problems, enhancing understanding.\n - Cons: Could be redundant if the user already understands the similarity.\n\n**Critique of the Pros and Cons:**\n\nEach response has its merits depending on the user's needs. However, since the user has provided a detailed example, it might be beneficial to walk through the steps to demonstrate understanding while delivering the answer concisely. 
This ensures that all users, whether looking for a quick answer or a detailed explanation, are satisfied.\n\n**Decide on the Best Response:**\n\nConsidering the balance between clarity and conciseness, a response that explains the process briefly and then provides the answer seems most appropriate.\n\n**Final Response:**\n\nTo solve the problem, follow these steps:\n\n1. **Sort the house locations**: 2, 4, 5, 9, 10, 11, 16, 17, 19.\n2. **Place towers strategically**:\n - Cover the house at mile 2 with a tower at mile 6 (2 + 4 = 6), covering up to mile 10.\n - The next uncovered house is at mile 11. Place another tower at mile 15 (11 + 4 = 15), covering up to mile 19.\n\nWith these placements, all houses are covered. Thus, the minimum number of cell phone towers needed is **2**." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "2", + "usage": { + "prompt_tokens": 897, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: 
'1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. Each cell phone tower can cover houses located next to the road within a 4-mile radius. Find the minimum number of cell phone towers needed to cover all houses next to the road. Your answer should be a positive numerical integer value.", + "\n# Correct answer", + "2", + "\n# Answer to be graded", + "2" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 305, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + } +] \ No newline at end of file diff --git a/python/packages/autogen-ext/tests/task_centric_memory/sessions/self_teaching/session.json b/python/packages/autogen-ext/tests/task_centric_memory/sessions/self_teaching/session.json new file mode 100644 index 000000000000..fbd6d9f126f8 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/sessions/self_teaching/session.json @@ -0,0 +1,1064 @@ +[ + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. 
Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. 
Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. 
**Final Response:**\n In total, nine of them are liars.", + "usage": { + "prompt_tokens": 205, + "completion_tokens": 512 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "\n# Text that may contain an answer", + "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. 
If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "In total, nine of them are liars.", + "usage": { + "prompt_tokens": 701, + "completion_tokens": 11 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "\n# Correct answer", + "All of them are liars.", + "\n# Answer to be graded", + "In total, nine of them are liars." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "0", + "usage": { + "prompt_tokens": 272, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "- You are a patient and thorough teacher.\n- Your job is to review work done by students and help them learn how to do better.", + "type": "SystemMessage" + }, + { + "content": [ + "# A team of students made a mistake on the following task:\n", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "# Here's the expected answer, which would have been correct:\n", + "All of them are liars.", + "# Here is the students' answer, which was INCORRECT:\n", + "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. 
Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars.", + "# Please review the students' work which follows:\n", + "**----- START OF STUDENTS' WORK -----**\n\n", + "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. 
If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars.", + "\n**----- END OF STUDENTS' WORK -----**\n\n", + "# Now carefully review the students' work above, explaining in detail what the students did right and what they did wrong.\n" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "The students attempted to address a logical puzzle involving liars and truth-tellers, and their reasoning process included analyzing different scenarios. 
Here's a detailed review of their approach:\n\n### What the Students Did Right:\n\n1. **Understanding the Problem:** \n - They correctly identified the main elements of the problem: the ten people's statements and the need to determine how many are liars given that at least one is a liar.\n\n2. **Listing Possible Responses:**\n - They considered multiple scenarios: all ten are liars, nine are liars and one is truthful, and other combinations. This demonstrates a good approach to problem-solving by exploring different possibilities.\n\n3. **Analyzing Pros and Cons:**\n - The students evaluated the pros and cons of each scenario, which is a critical step in logical reasoning. They identified potential contradictions and reasons why certain scenarios might or might not work.\n\n### What the Students Did Wrong:\n\n1. **Misinterpretation of Liar Logic:**\n - The students incorrectly concluded that nine people being liars and one being truthful was the correct scenario. This analysis lacks a crucial understanding of liar logic. If all ten are liars, their statement \"At least one of us is not a liar\" is false, which is consistent with them being liars.\n\n2. **Incorrect Critique of All Liars Scenario:**\n - They misunderstood that if all ten were liars, the statement \"At least one of us is not a liar\" would indeed be a false statement, aligning with them being liars. Since liars' statements are false, claiming \"at least one of us is not a liar\" means all are liars.\n\n3. **Logical Flaw:**\n - The flaw in their reasoning is thinking that if all are liars, their statement becomes true. 
In fact, if all are liars, the statement is false, satisfying the conditions perfectly.\n\n### Correct Analysis:\n\n- **All are Liars:** In this scenario, the statement \"At least one of us is not a liar\" turns out to be false, as required by liars, because if it were true, then there would indeed be at least one non-liar, contradicting the given information.\n\n### Conclusion:\n\nThe correct solution is that all ten people are liars. The students need to revisit the logic of liar statements and improve their understanding of how falsity and truth are determined in such puzzles. This exercise is an excellent opportunity to practice logical reasoning and critical thinking skills.", + "usage": { + "prompt_tokens": 1215, + "completion_tokens": 506 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "- You are a patient and thorough teacher.\n- Your job is to review work done by students and help them learn how to do better.", + "type": "SystemMessage" + }, + { + "content": [ + "# A team of students made a mistake on the following task:\n", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "# Here's the expected answer, which would have been correct:\n", + "All of them are liars.", + "# Here is the students' answer, which was INCORRECT:\n", + "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. 
**Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars.", + "# Please review the students' work which follows:\n", + "**----- START OF STUDENTS' WORK -----**\n\n", + "1. 
**Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. 
**Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars.", + "\n**----- END OF STUDENTS' WORK -----**\n\n", + "# Now carefully review the students' work above, explaining in detail what the students did right and what they did wrong.\n" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "The students attempted to address a logical puzzle involving liars and truth-tellers, and their reasoning process included analyzing different scenarios. Here's a detailed review of their approach:\n\n### What the Students Did Right:\n\n1. **Understanding the Problem:** \n - They correctly identified the main elements of the problem: the ten people's statements and the need to determine how many are liars given that at least one is a liar.\n\n2. **Listing Possible Responses:**\n - They considered multiple scenarios: all ten are liars, nine are liars and one is truthful, and other combinations. This demonstrates a good approach to problem-solving by exploring different possibilities.\n\n3. **Analyzing Pros and Cons:**\n - The students evaluated the pros and cons of each scenario, which is a critical step in logical reasoning. They identified potential contradictions and reasons why certain scenarios might or might not work.\n\n### What the Students Did Wrong:\n\n1. **Misinterpretation of Liar Logic:**\n - The students incorrectly concluded that nine people being liars and one being truthful was the correct scenario. This analysis lacks a crucial understanding of liar logic. If all ten are liars, their statement \"At least one of us is not a liar\" is false, which is consistent with them being liars.\n\n2. **Incorrect Critique of All Liars Scenario:**\n - They misunderstood that if all ten were liars, the statement \"At least one of us is not a liar\" would indeed be a false statement, aligning with them being liars. 
Since liars' statements are false, claiming \"at least one of us is not a liar\" means all are liars.\n\n3. **Logical Flaw:**\n - The flaw in their reasoning is thinking that if all are liars, their statement becomes true. In fact, if all are liars, the statement is false, satisfying the conditions perfectly.\n\n### Correct Analysis:\n\n- **All are Liars:** In this scenario, the statement \"At least one of us is not a liar\" turns out to be false, as required by liars, because if it were true, then there would indeed be at least one non-liar, contradicting the given information.\n\n### Conclusion:\n\nThe correct solution is that all ten people are liars. The students need to revisit the logic of liar statements and improve their understanding of how falsity and truth are determined in such puzzles. This exercise is an excellent opportunity to practice logical reasoning and critical thinking skills.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Now put yourself in the mind of the students. What misconception led them to their incorrect answer?" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "The students likely had a fundamental misconception about the nature of liar logic, which led them to their incorrect conclusion. Let's explore their possible thought process:\n\n### Misconception:\n\n1. **Truth of the Statement:**\n - The students seemed to have misinterpreted the implications of the statement \"At least one of us is not a liar.\" They assumed that if all ten were liars, the statement would somehow become true. However, for liars, any statement they make must be false. Thus, if all ten are liars, their collective statement must indeed be false.\n\n2. 
**Contradictory Logic:**\n - They might have believed that a liar's statement could incidentally align with the truth, not realizing that the problem is set up precisely so that their statements' truth value defines their status as liars. The belief that nine liars and one truth-teller could exist without contradiction stems from this misunderstanding.\n\n3. **Understanding of \"At Least One\":**\n - The phrase \"at least one of us is not a liar\" might have been interpreted too literally, leading them to assume it could be true if there was even a single truth-teller among them. However, the critical point missed here is that if the statement is false (which is what we expect from liars), it can only be false if all are liars.\n\n### Learning Point:\n\n- **Nature of Liar Statements:** In puzzles like these, understanding that liars' statements are inherently false is crucial. If the collective statement is \"there is at least one non-liar,\" the falsehood of this statement directly implies that all must be liars.\n\nBy addressing and correcting this misconception, the students can improve their logical reasoning skills and better tackle similar puzzles in the future.", + "usage": { + "prompt_tokens": 1751, + "completion_tokens": 361 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "- You are a patient and thorough teacher.\n- Your job is to review work done by students and help them learn how to do better.", + "type": "SystemMessage" + }, + { + "content": [ + "# A team of students made a mistake on the following task:\n", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. 
How many of them are liars in total?", + "# Here's the expected answer, which would have been correct:\n", + "All of them are liars.", + "# Here is the students' answer, which was INCORRECT:\n", + "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. 
**Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars.", + "# Please review the students' work which follows:\n", + "**----- START OF STUDENTS' WORK -----**\n\n", + "1. **Understanding the User Message:**\n - You have asked ten people a question: \"How many of you are liars?\"\n - All ten of them respond with: \"At least one of us is not a liar.\"\n - You know that at least one of them is a liar.\n - You want to determine how many of the ten are liars in total.\n\n2. **Possible Responses:**\n - All ten people are liars.\n - Nine people are liars, and one is telling the truth.\n - Some other combination of liars and truth-tellers.\n\n3. **Pros and Cons of Each Possible Response:**\n - **All ten people are liars:**\n - Pros: This would satisfy your knowledge that at least one of them is a liar. If all are liars, their statement \"At least one of us is not a liar\" would be false.\n - Cons: If they were all liars, their statement \"At least one of us is not a liar\" would actually be true, which contradicts the definition of a liar.\n \n - **Nine people are liars, and one is telling the truth:**\n - Pros: This scenario fits with your knowledge that there is at least one liar. 
Nine liars making the statement \"At least one of us is not a liar\" would be false for them, while the one truth-teller would actually be making a true statement.\n - Cons: There are no apparent contradictions in this scenario.\n \n - **Some other combination:**\n - Pros: It's theoretically possible, but any other combination would not make the statement \"At least one of us is not a liar\" consistent for the liars.\n - Cons: This scenario could lead to contradictions because if more than one person were telling the truth, their statement would be true, contradicting the idea of them being liars.\n\n4. **Critique the Pros and Cons:**\n - The scenario where all ten are liars has a critical flaw: their statement would inadvertently be true, contradicting their status as liars.\n - The second scenario, where nine are liars and one is telling the truth, logically fits all the conditions without contradiction.\n\n5. **Decide on the Best Response:**\n - The most logical conclusion is that nine of the people are liars, and one is telling the truth.\n\n6. **Final Response:**\n In total, nine of them are liars.", + "\n**----- END OF STUDENTS' WORK -----**\n\n", + "# Now carefully review the students' work above, explaining in detail what the students did right and what they did wrong.\n" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "The students attempted to address a logical puzzle involving liars and truth-tellers, and their reasoning process included analyzing different scenarios. Here's a detailed review of their approach:\n\n### What the Students Did Right:\n\n1. **Understanding the Problem:** \n - They correctly identified the main elements of the problem: the ten people's statements and the need to determine how many are liars given that at least one is a liar.\n\n2. **Listing Possible Responses:**\n - They considered multiple scenarios: all ten are liars, nine are liars and one is truthful, and other combinations. 
This demonstrates a good approach to problem-solving by exploring different possibilities.\n\n3. **Analyzing Pros and Cons:**\n - The students evaluated the pros and cons of each scenario, which is a critical step in logical reasoning. They identified potential contradictions and reasons why certain scenarios might or might not work.\n\n### What the Students Did Wrong:\n\n1. **Misinterpretation of Liar Logic:**\n - The students incorrectly concluded that nine people being liars and one being truthful was the correct scenario. This analysis lacks a crucial understanding of liar logic. If all ten are liars, their statement \"At least one of us is not a liar\" is false, which is consistent with them being liars.\n\n2. **Incorrect Critique of All Liars Scenario:**\n - They misunderstood that if all ten were liars, the statement \"At least one of us is not a liar\" would indeed be a false statement, aligning with them being liars. Since liars' statements are false, claiming \"at least one of us is not a liar\" means all are liars.\n\n3. **Logical Flaw:**\n - The flaw in their reasoning is thinking that if all are liars, their statement becomes true. In fact, if all are liars, the statement is false, satisfying the conditions perfectly.\n\n### Correct Analysis:\n\n- **All are Liars:** In this scenario, the statement \"At least one of us is not a liar\" turns out to be false, as required by liars, because if it were true, then there would indeed be at least one non-liar, contradicting the given information.\n\n### Conclusion:\n\nThe correct solution is that all ten people are liars. The students need to revisit the logic of liar statements and improve their understanding of how falsity and truth are determined in such puzzles. This exercise is an excellent opportunity to practice logical reasoning and critical thinking skills.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Now put yourself in the mind of the students. 
What misconception led them to their incorrect answer?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "The students likely had a fundamental misconception about the nature of liar logic, which led them to their incorrect conclusion. Let's explore their possible thought process:\n\n### Misconception:\n\n1. **Truth of the Statement:**\n - The students seemed to have misinterpreted the implications of the statement \"At least one of us is not a liar.\" They assumed that if all ten were liars, the statement would somehow become true. However, for liars, any statement they make must be false. Thus, if all ten are liars, their collective statement must indeed be false.\n\n2. **Contradictory Logic:**\n - They might have believed that a liar's statement could incidentally align with the truth, not realizing that the problem is set up precisely so that their statements' truth value defines their status as liars. The belief that nine liars and one truth-teller could exist without contradiction stems from this misunderstanding.\n\n3. **Understanding of \"At Least One\":**\n - The phrase \"at least one of us is not a liar\" might have been interpreted too literally, leading them to assume it could be true if there was even a single truth-teller among them. However, the critical point missed here is that if the statement is false (which is what we expect from liars), it can only be false if all are liars.\n\n### Learning Point:\n\n- **Nature of Liar Statements:** In puzzles like these, understanding that liars' statements are inherently false is crucial. 
If the collective statement is \"there is at least one non-liar,\" the falsehood of this statement directly implies that all must be liars.\n\nBy addressing and correcting this misconception, the students can improve their logical reasoning skills and better tackle similar puzzles in the future.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Please express your key insights in the form of short, general advice that will be given to the students. Just one or two sentences, or they won't bother to read it." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. Always question the truth value of collective statements to reveal the underlying logic.", + "usage": { + "prompt_tokens": 2158, + "completion_tokens": 49 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' 
You happen to know that at least one of them IS a liar. How many of them are liars in total?\n\n## Important insights that may help solve tasks like this\n- When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. Always question the truth value of collective statements to reveal the underlying logic.\n", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "### Step 1: Understanding the User Message\n\nThe user presents a classic logic puzzle often referred to as the \"liars puzzle.\" The scenario involves ten people, and the task is to determine how many of them are liars based on their collective statement. Each person says, \"At least one of us is not a liar.\" Additionally, it's a known fact that at least one of them is indeed a liar.\n\nKey points to consider:\n- Each of the ten individuals claims that \"at least one of us is not a liar.\"\n- It is known that at least one of these individuals is a liar.\n\n### Step 2: Possible Responses\n\n1. Interpret the statement logically and conclude how many liars there must be.\n2. Break down the statement to see if it could be true, given the known fact, and deduce accordingly.\n3. Consider the implications of everyone being a liar and whether that would still satisfy the known conditions.\n4. Discuss any assumptions that might affect the solution.\n\n### Step 3: Pros and Cons of Each Possible Response\n\n1. **Interpret the statement logically:**\n - **Pros:** Directly engages with the logical implications of the statement.\n - **Cons:** May assume prior knowledge of similar logical puzzles without explanation.\n\n2. **Break down the statement given the facts:**\n - **Pros:** Provides a clear step-by-step analysis of how the statement interacts with the known fact.\n - **Cons:** Might get complex if over-explained, potentially confusing the main point.\n\n3. 
**Consider implications if all are liars:**\n - **Pros:** Quickly highlights a scenario that resolves the paradox.\n - **Cons:** Might not fully address intermediate possibilities.\n\n4. **Discuss assumptions:**\n - **Pros:** Clarifies any hidden assumptions that could alter the interpretation of the statements.\n - **Cons:** Might overcomplicate the issue without adding clarity.\n\n### Step 4: Critique the Pros and Cons\n\n- The strongest approach seems to involve a logical interpretation of the statement combined with breaking down the logical implications of the given facts. Considering all being liars provides a rapid conclusion but might bypass intermediate reasoning.\n- Discussing assumptions is helpful but unnecessary if the logic is straightforward and based purely on evaluating the truthfulness of the given statement.\n\n### Step 5: Decide on the Best Response\n\nThe best approach is to break down the logical implications of the statement \"At least one of us is not a liar\" in conjunction with the known fact that there is at least one liar. This leads us straightforwardly to the conclusion.\n\n### Final Response\n\nEach person says, \"At least one of us is not a liar.\" If this statement were true, there would be at least one person who is truthful, contradicting the known fact that there is at least one liar. Therefore, the statement must be false, meaning that all of them must be liars. 
Hence, all ten people are liars.", + "usage": { + "prompt_tokens": 265, + "completion_tokens": 602 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "\n# Text that may contain an answer", + "### Step 1: Understanding the User Message\n\nThe user presents a classic logic puzzle often referred to as the \"liars puzzle.\" The scenario involves ten people, and the task is to determine how many of them are liars based on their collective statement. Each person says, \"At least one of us is not a liar.\" Additionally, it's a known fact that at least one of them is indeed a liar.\n\nKey points to consider:\n- Each of the ten individuals claims that \"at least one of us is not a liar.\"\n- It is known that at least one of these individuals is a liar.\n\n### Step 2: Possible Responses\n\n1. Interpret the statement logically and conclude how many liars there must be.\n2. Break down the statement to see if it could be true, given the known fact, and deduce accordingly.\n3. 
Consider the implications of everyone being a liar and whether that would still satisfy the known conditions.\n4. Discuss any assumptions that might affect the solution.\n\n### Step 3: Pros and Cons of Each Possible Response\n\n1. **Interpret the statement logically:**\n - **Pros:** Directly engages with the logical implications of the statement.\n - **Cons:** May assume prior knowledge of similar logical puzzles without explanation.\n\n2. **Break down the statement given the facts:**\n - **Pros:** Provides a clear step-by-step analysis of how the statement interacts with the known fact.\n - **Cons:** Might get complex if over-explained, potentially confusing the main point.\n\n3. **Consider implications if all are liars:**\n - **Pros:** Quickly highlights a scenario that resolves the paradox.\n - **Cons:** Might not fully address intermediate possibilities.\n\n4. **Discuss assumptions:**\n - **Pros:** Clarifies any hidden assumptions that could alter the interpretation of the statements.\n - **Cons:** Might overcomplicate the issue without adding clarity.\n\n### Step 4: Critique the Pros and Cons\n\n- The strongest approach seems to involve a logical interpretation of the statement combined with breaking down the logical implications of the given facts. Considering all being liars provides a rapid conclusion but might bypass intermediate reasoning.\n- Discussing assumptions is helpful but unnecessary if the logic is straightforward and based purely on evaluating the truthfulness of the given statement.\n\n### Step 5: Decide on the Best Response\n\nThe best approach is to break down the logical implications of the statement \"At least one of us is not a liar\" in conjunction with the known fact that there is at least one liar. 
This leads us straightforwardly to the conclusion.\n\n### Final Response\n\nEach person says, \"At least one of us is not a liar.\" If this statement were true, there would be at least one person who is truthful, contradicting the known fact that there is at least one liar. Therefore, the statement must be false, meaning that all of them must be liars. Hence, all ten people are liars." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "All ten people are liars.", + "usage": { + "prompt_tokens": 791, + "completion_tokens": 8 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. 
How many of them are liars in total?", + "\n# Correct answer", + "All of them are liars.", + "\n# Answer to be graded", + "All ten people are liars." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 269, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "To rephrase and simplify the task description and identify the main points:\n\n1. You are asking a group of ten people a question.\n2. The question you ask is: \"How many of you are liars?\"\n3. Each of the ten people responds with the same answer: \"At least one of us is not a liar.\"\n4. You have prior knowledge that at least one person in the group is definitely a liar.\n5. 
Your task is to determine the total number of liars in the group.", + "usage": { + "prompt_tokens": 138, + "completion_tokens": 105 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "To rephrase and simplify the task description and identify the main points:\n\n1. You are asking a group of ten people a question.\n2. The question you ask is: \"How many of you are liars?\"\n3. Each of the ten people responds with the same answer: \"At least one of us is not a liar.\"\n4. You have prior knowledge that at least one person in the group is definitely a liar.\n5. Your task is to determine the total number of liars in the group.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Upon analyzing the list, the following points are not strictly necessary for solving the task:\n\n1. 
**The specific question asked**: \"How many of you are liars?\" is not directly relevant to solving the task, as the critical part is their identical response and your prior knowledge. The focus should be on their response and the information given.\n\n2. **The fact that each person gives the same response**: While it helps paint the scenario, the specific detail that everyone says the same thing doesn't change the logic needed to solve the problem.\n\nThe relevant points essential for solving the task are:\n\n- The group consists of ten people.\n- All of them assert that \"At least one of us is not a liar.\"\n- You know for certain that at least one person in the group is a liar.\n- You need to find out how many people in the group are liars.", + "usage": { + "prompt_tokens": 280, + "completion_tokens": 178 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "To rephrase and simplify the task description and identify the main points:\n\n1. You are asking a group of ten people a question.\n2. The question you ask is: \"How many of you are liars?\"\n3. Each of the ten people responds with the same answer: \"At least one of us is not a liar.\"\n4. 
You have prior knowledge that at least one person in the group is definitely a liar.\n5. Your task is to determine the total number of liars in the group.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Upon analyzing the list, the following points are not strictly necessary for solving the task:\n\n1. **The specific question asked**: \"How many of you are liars?\" is not directly relevant to solving the task, as the critical part is their identical response and your prior knowledge. The focus should be on their response and the information given.\n\n2. **The fact that each person gives the same response**: While it helps paint the scenario, the specific detail that everyone says the same thing doesn't change the logic needed to solve the problem.\n\nThe relevant points essential for solving the task are:\n\n- The group consists of ten people.\n- All of them assert that \"At least one of us is not a liar.\"\n- You know for certain that at least one person in the group is a liar.\n- You need to find out how many people in the group are liars.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Revise your original list to include only the most general terms, those that are critical to solving the task, removing any themes or descriptions that are not essential to the solution. Your final list may be shorter, but do not leave out any part of the task that is needed for solving the task. Do not add any additional commentary either before or after the list." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. There are ten people in the group.\n2. 
Each person claims, \"At least one of us is not a liar.\"\n3. You know at least one person in the group is a liar.\n4. Determine the total number of liars in the group.", + "usage": { + "prompt_tokens": 542, + "completion_tokens": 56 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "1. There are ten people in the group.\n2. Each person claims, \"At least one of us is not a liar.\"\n3. You know at least one person in the group is a liar.\n4. Determine the total number of liars in the group.\n(Hint: When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. 
Always question the truth value of collective statements to reveal the underlying logic.)" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Logical reasoning \nLiar detection strategies \nAnalyzing group statements \nEvaluating truth values \nProblem-solving in group settings", + "usage": { + "prompt_tokens": 314, + "completion_tokens": 24 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Here is a simplified breakdown of the task:\n\n1. You are asking a group of ten people a question.\n2. The question you ask is: \"How many of you are liars?\"\n3. Each person in the group gives the same answer: \"At least one of us is not a liar.\"\n4. You already know for sure that at least one person in the group is lying.\n5. 
Your task is to determine how many people in the group are liars.", + "usage": { + "prompt_tokens": 138, + "completion_tokens": 98 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Here is a simplified breakdown of the task:\n\n1. You are asking a group of ten people a question.\n2. The question you ask is: \"How many of you are liars?\"\n3. Each person in the group gives the same answer: \"At least one of us is not a liar.\"\n4. You already know for sure that at least one person in the group is lying.\n5. Your task is to determine how many people in the group are liars.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Yes, some parts of the list are not directly relevant to solving the task:\n\n1. **The specific question asked to the group: \"How many of you are liars?\"** - This detail is not needed to solve the task. 
The critical part is their response and what it implies, not the exact wording of the question.\n\n2. **Each person gives the same answer: \"At least one of us is not a liar.\"** - While this is important for understanding their collective response, the exact phrasing is less relevant than understanding the implication of their answer.\n\nThe essential information needed to solve the task is:\n\n- There are ten people in the group.\n- They all claim that at least one person is not a liar.\n- You know for sure that at least one person is lying.\n- Determine the total number of liars in the group based on these statements.", + "usage": { + "prompt_tokens": 273, + "completion_tokens": 178 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Here is a simplified breakdown of the task:\n\n1. You are asking a group of ten people a question.\n2. The question you ask is: \"How many of you are liars?\"\n3. Each person in the group gives the same answer: \"At least one of us is not a liar.\"\n4. You already know for sure that at least one person in the group is lying.\n5. 
Your task is to determine how many people in the group are liars.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Yes, some parts of the list are not directly relevant to solving the task:\n\n1. **The specific question asked to the group: \"How many of you are liars?\"** - This detail is not needed to solve the task. The critical part is their response and what it implies, not the exact wording of the question.\n\n2. **Each person gives the same answer: \"At least one of us is not a liar.\"** - While this is important for understanding their collective response, the exact phrasing is less relevant than understanding the implication of their answer.\n\nThe essential information needed to solve the task is:\n\n- There are ten people in the group.\n- They all claim that at least one person is not a liar.\n- You know for sure that at least one person is lying.\n- Determine the total number of liars in the group based on these statements.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Revise your original list to include only the most general terms, those that are critical to solving the task, removing any themes or descriptions that are not essential to the solution. Your final list may be shorter, but do not leave out any part of the task that is needed for solving the task. Do not add any additional commentary either before or after the list." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. There are ten people.\n2. They all say at least one person is not a liar.\n3. You know at least one person is lying.\n4. 
Determine how many are liars.", + "usage": { + "prompt_tokens": 535, + "completion_tokens": 42 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "1. There are ten people.\n2. They all say at least one person is not a liar.\n3. You know at least one person is lying.\n4. Determine how many are liars." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "determining liars in a group \ntruth assessment strategies \nlogical deduction \ngroup truth analysis \nproblem-solving in group dynamics", + "usage": { + "prompt_tokens": 248, + "completion_tokens": 25 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a potential insight that may or may not be useful for solving a given task.\n- First review the following task.\n- Then review the insight that follows, and consider whether it might help solve the given task.\n- Do not attempt to actually solve the task.\n- Reply with a single character, '1' if the insight may be useful, or '0' if it is not.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "\n# Possibly useful insight", + "When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. Always question the truth value of collective statements to reveal the underlying logic." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 209, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. 
Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?\n\n## Important insights that may help solve tasks like this\n- When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. Always question the truth value of collective statements to reveal the underlying logic.\n", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "### Step 1: Understanding the User Message\n\nThe user has presented a classic liar puzzle. The scenario involves ten people who are asked how many of them are liars. They all respond with the statement, \"At least one of us is not a liar.\" The user knows for certain that at least one of these ten people is indeed a liar.\n\nThe puzzle poses the question of determining the total number of liars among the group based on the given responses and the known fact.\n\n### Step 2: List of Possible Responses\n\n1. **Response A**: All ten people are liars.\n2. **Response B**: Nine people are liars, and one is telling the truth.\n3. 
**Response C**: There is some other combination of liars and truth-tellers besides A and B.\n\n### Step 3: Weigh the Pros and Cons\n\n- **Response A: All ten people are liars.**\n - *Pros*: If all ten are liars, the given statement \"At least one of us is not a liar\" is false because none of them are truth-tellers, aligning with the known fact that at least one is a liar.\n - *Cons*: None apparent as it fits both the statement and the known fact perfectly.\n\n- **Response B: Nine people are liars, and one is telling the truth.**\n - *Pros*: This scenario allows for a truth-teller who could truthfully say, \"At least one of us is not a liar.\"\n - *Cons*: If there is one truth-teller, the statement \"At least one of us is not a liar\" becomes true, which contradicts the intention of the puzzle where the statement needs to be false for the majority (all liars). Additionally, since one of them is a truth-teller, they would say, \"At least one of us (myself) is not a liar,\" but others should have said different things.\n\n- **Response C: Some other combination.**\n - *Pros*: Flexibility to consider other combinations of liars and truth-tellers.\n - *Cons*: Any combination other than all liars will make the statement true, which contradicts the intended puzzle's logic and the fact that at least one person is a liar.\n\n### Step 4: Critique the Pros and Cons\n\nUpon reviewing the pros and cons, Response A holds the most weight logically. 
If all ten are liars, their statement \"At least one of us is not a liar\" would indeed be false, which aligns with the requirement that the statement must be false and the condition that at least one person must be lying.\n\nResponse B's flaw is that it allows for a truth-teller to exist, making the collective statement potentially true, contradicting the premise that \"At least one of them is a liar\" implies all should be lying.\n\nResponse C is too vague and does not provide a plausible solution given the constraints of the puzzle.\n\n### Step 5: Decide on the Best Response\n\nThe best response is Response A: All ten people are liars.\n\n### Step 6: Final Response\n\nAll ten of them are liars. Therefore, the statement \"At least one of us is not a liar\" is false, which is consistent with the known fact that there is at least one liar among them.", + "usage": { + "prompt_tokens": 265, + "completion_tokens": 681 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. 
How many of them are liars in total?", + "\n# Text that may contain an answer", + "### Step 1: Understanding the User Message\n\nThe user has presented a classic liar puzzle. The scenario involves ten people who are asked how many of them are liars. They all respond with the statement, \"At least one of us is not a liar.\" The user knows for certain that at least one of these ten people is indeed a liar.\n\nThe puzzle poses the question of determining the total number of liars among the group based on the given responses and the known fact.\n\n### Step 2: List of Possible Responses\n\n1. **Response A**: All ten people are liars.\n2. **Response B**: Nine people are liars, and one is telling the truth.\n3. **Response C**: There is some other combination of liars and truth-tellers besides A and B.\n\n### Step 3: Weigh the Pros and Cons\n\n- **Response A: All ten people are liars.**\n - *Pros*: If all ten are liars, the given statement \"At least one of us is not a liar\" is false because none of them are truth-tellers, aligning with the known fact that at least one is a liar.\n - *Cons*: None apparent as it fits both the statement and the known fact perfectly.\n\n- **Response B: Nine people are liars, and one is telling the truth.**\n - *Pros*: This scenario allows for a truth-teller who could truthfully say, \"At least one of us is not a liar.\"\n - *Cons*: If there is one truth-teller, the statement \"At least one of us is not a liar\" becomes true, which contradicts the intention of the puzzle where the statement needs to be false for the majority (all liars). 
Additionally, since one of them is a truth-teller, they would say, \"At least one of us (myself) is not a liar,\" but others should have said different things.\n\n- **Response C: Some other combination.**\n - *Pros*: Flexibility to consider other combinations of liars and truth-tellers.\n - *Cons*: Any combination other than all liars will make the statement true, which contradicts the intended puzzle's logic and the fact that at least one person is a liar.\n\n### Step 4: Critique the Pros and Cons\n\nUpon reviewing the pros and cons, Response A holds the most weight logically. If all ten are liars, their statement \"At least one of us is not a liar\" would indeed be false, which aligns with the requirement that the statement must be false and the condition that at least one person must be lying.\n\nResponse B's flaw is that it allows for a truth-teller to exist, making the collective statement potentially true, contradicting the premise that \"At least one of them is a liar\" implies all should be lying.\n\nResponse C is too vague and does not provide a plausible solution given the constraints of the puzzle.\n\n### Step 5: Decide on the Best Response\n\nThe best response is Response A: All ten people are liars.\n\n### Step 6: Final Response\n\nAll ten of them are liars. Therefore, the statement \"At least one of us is not a liar\" is false, which is consistent with the known fact that there is at least one liar among them." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "All ten of them are liars.", + "usage": { + "prompt_tokens": 870, + "completion_tokens": 9 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "You ask ten people 'How many of you are liars?' They all answer 'At least one of us is not a liar.' You happen to know that at least one of them IS a liar. How many of them are liars in total?", + "\n# Correct answer", + "All of them are liars.", + "\n# Answer to be graded", + "All ten of them are liars." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 270, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". 
Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "To rephrase the task in simpler terms, here are the important points:\n\n1. You are a vampire hunter named Van Helsing.\n2. You are tasked with investigating a village called \u0218irnea in Wallachia.\n3. A vampire was reportedly seen near the village.\n4. You travel to \u0218irnea to investigate.\n5. You see someone with vampire-like abilities but lose track of them.\n6. You conclude that the vampire must be a resident of the village.\n7. The village has 100 residents.\n8. You know that humans tell the truth and vampires always lie.\n9. You ask each resident the same question about the presence of humans.\n10. All residents respond with \"At least one of us is a human.\"\n\nFrom the given responses and your knowledge about vampires and humans, you can deduce how many village residents are vampires.", + "usage": { + "prompt_tokens": 377, + "completion_tokens": 174 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. 
The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "To rephrase the task in simpler terms, here are the important points:\n\n1. You are a vampire hunter named Van Helsing.\n2. You are tasked with investigating a village called \u0218irnea in Wallachia.\n3. A vampire was reportedly seen near the village.\n4. You travel to \u0218irnea to investigate.\n5. You see someone with vampire-like abilities but lose track of them.\n6. You conclude that the vampire must be a resident of the village.\n7. The village has 100 residents.\n8. You know that humans tell the truth and vampires always lie.\n9. You ask each resident the same question about the presence of humans.\n10. 
All residents respond with \"At least one of us is a human.\"\n\nFrom the given responses and your knowledge about vampires and humans, you can deduce how many village residents are vampires.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Yes, there are a few parts of the list that are more about setting the scene rather than directly contributing to solving the task:\n\n1. **You are a vampire hunter named Van Helsing.**\n - This is background information about the character, which doesn't affect the solution.\n\n2. **You travel to \u0218irnea to investigate.**\n - This is part of the narrative and setup.\n\n3. **You see someone with vampire-like abilities but lose track of them.**\n - This part of the narrative helps establish the presence of a vampire but doesn't directly contribute to deducing the number of vampires.\n\nKey points needed to solve the task are:\n\n- You conclude that the vampire must be a resident of the village.\n- The village has 100 residents.\n- You know that humans tell the truth and vampires always lie.\n- You ask each resident about the presence of humans.\n- All residents respond with \"At least one of us is a human.\"\n\nThe task boils down to logically deducing the number of vampires based on the rules about truth-telling and lying.", + "usage": { + "prompt_tokens": 588, + "completion_tokens": 216 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. 
Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "To rephrase the task in simpler terms, here are the important points:\n\n1. You are a vampire hunter named Van Helsing.\n2. You are tasked with investigating a village called \u0218irnea in Wallachia.\n3. 
A vampire was reportedly seen near the village.\n4. You travel to \u0218irnea to investigate.\n5. You see someone with vampire-like abilities but lose track of them.\n6. You conclude that the vampire must be a resident of the village.\n7. The village has 100 residents.\n8. You know that humans tell the truth and vampires always lie.\n9. You ask each resident the same question about the presence of humans.\n10. All residents respond with \"At least one of us is a human.\"\n\nFrom the given responses and your knowledge about vampires and humans, you can deduce how many village residents are vampires.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "Yes, there are a few parts of the list that are more about setting the scene rather than directly contributing to solving the task:\n\n1. **You are a vampire hunter named Van Helsing.**\n - This is background information about the character, which doesn't affect the solution.\n\n2. **You travel to \u0218irnea to investigate.**\n - This is part of the narrative and setup.\n\n3. 
**You see someone with vampire-like abilities but lose track of them.**\n - This part of the narrative helps establish the presence of a vampire but doesn't directly contribute to deducing the number of vampires.\n\nKey points needed to solve the task are:\n\n- You conclude that the vampire must be a resident of the village.\n- The village has 100 residents.\n- You know that humans tell the truth and vampires always lie.\n- You ask each resident about the presence of humans.\n- All residents respond with \"At least one of us is a human.\"\n\nThe task boils down to logically deducing the number of vampires based on the rules about truth-telling and lying.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Revise your original list to include only the most general terms, those that are critical to solving the task, removing any themes or descriptions that are not essential to the solution. Your final list may be shorter, but do not leave out any part of the task that is needed for solving the task. Do not add any additional commentary either before or after the list." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. You are tasked with investigating a village called \u0218irnea.\n2. A vampire was reportedly seen near the village.\n3. The vampire must be a resident of the village.\n4. The village has 100 residents.\n5. Humans tell the truth; vampires always lie.\n6. You ask each resident if there is at least one human among them.\n7. 
All residents respond, \"At least one of us is a human.\"", + "usage": { + "prompt_tokens": 888, + "completion_tokens": 90 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "1. You are tasked with investigating a village called \u0218irnea.\n2. A vampire was reportedly seen near the village.\n3. The vampire must be a resident of the village.\n4. The village has 100 residents.\n5. Humans tell the truth; vampires always lie.\n6. You ask each resident if there is at least one human among them.\n7. 
All residents respond, \"At least one of us is a human.\"" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Investigating a village \nIdentifying vampires \nTruth-telling strategy \nLying detection \nInterviewing residents", + "usage": { + "prompt_tokens": 296, + "completion_tokens": 22 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a potential insight that may or may not be useful for solving a given task.\n- First review the following task.\n- Then review the insight that follows, and consider whether it might help solve the given task.\n- Do not attempt to actually solve the task.\n- Reply with a single character, '1' if the insight may be useful, or '0' if it is not.", + "\n# Task description", + "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. 
You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?", + "\n# Possibly useful insight", + "When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. Always question the truth value of collective statements to reveal the underlying logic." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 448, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "You are Van Helsing, a renowned vampire hunter. 
A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?\n\n## Important insights that may help solve tasks like this\n- When dealing with liar puzzles, remember that liars always make false statements; thus, if a group's claim seems true, it's likely false, indicating all are liars. Always question the truth value of collective statements to reveal the underlying logic.\n", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "To solve this problem, I will follow a structured approach to understand the situation and determine the number of vampires in the village.\n\n1. 
**Understanding the Situation**: \n - You are Van Helsing, tasked with identifying vampires in the village of \u0218irnea. \n - You have observed a vampire, confirming that at least one vampire exists in the village.\n - There are 100 residents in the village.\n - Humans always tell the truth, and vampires always lie.\n - Each resident responded to your question, \"How many vampires are living in \u0218irnea?\", by saying, \"At least one of us is a human.\"\n\n2. **Possible Responses**:\n - All residents are vampires.\n - Only one resident is human, and the rest are vampires.\n - More than one but less than 100 residents are humans.\n\n3. **Weighing the Pros and Cons**:\n - **All residents are vampires**:\n - **Pros**: If all residents are vampires, they would lie and say that there is at least one human, which matches the response you received.\n - **Cons**: This scenario is consistent with the response.\n \n - **Only one resident is human, and the rest are vampires**:\n - **Pros**: The single human would truthfully say there is at least one human, and the vampires would lie and say the same.\n - **Cons**: This scenario is also consistent with the response.\n\n - **More than one but less than 100 residents are humans**:\n - **Pros**: Multiple humans would truthfully say there is at least one human.\n - **Cons**: This implies the presence of multiple vampires who would also lie and say the same. However, this scenario doesn't fit because if there were multiple humans, they would all independently confirm the presence of humans, meaning the statement would be redundant.\n\n4. **Critiquing the Pros and Cons**:\n - The response \"At least one of us is a human\" aligns perfectly with both scenarios where there is only one or no human resident.\n - If there were multiple humans, the statement would not be as collectively uniform, as each human would potentially provide the actual number of humans rather than the generic statement.\n\n5. 
**Deciding on the Best Response**:\n - The simplest explanation is often correct. Given how liars work in logic puzzles, all residents saying \"At least one of us is a human\" implies they are all lying since the statement can be true if only one or no humans exist.\n - Hence, the response from the village where everyone claims there is at least one human indicates they are all lying.\n\n6. **Final Response**:\n - All 100 residents of \u0218irnea have been turned into vampires.", + "usage": { + "prompt_tokens": 504, + "completion_tokens": 572 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! 
You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?", + "\n# Text that may contain an answer", + "To solve this problem, I will follow a structured approach to understand the situation and determine the number of vampires in the village.\n\n1. **Understanding the Situation**: \n - You are Van Helsing, tasked with identifying vampires in the village of \u0218irnea. \n - You have observed a vampire, confirming that at least one vampire exists in the village.\n - There are 100 residents in the village.\n - Humans always tell the truth, and vampires always lie.\n - Each resident responded to your question, \"How many vampires are living in \u0218irnea?\", by saying, \"At least one of us is a human.\"\n\n2. **Possible Responses**:\n - All residents are vampires.\n - Only one resident is human, and the rest are vampires.\n - More than one but less than 100 residents are humans.\n\n3. 
**Weighing the Pros and Cons**:\n - **All residents are vampires**:\n - **Pros**: If all residents are vampires, they would lie and say that there is at least one human, which matches the response you received.\n - **Cons**: This scenario is consistent with the response.\n \n - **Only one resident is human, and the rest are vampires**:\n - **Pros**: The single human would truthfully say there is at least one human, and the vampires would lie and say the same.\n - **Cons**: This scenario is also consistent with the response.\n\n - **More than one but less than 100 residents are humans**:\n - **Pros**: Multiple humans would truthfully say there is at least one human.\n - **Cons**: This implies the presence of multiple vampires who would also lie and say the same. However, this scenario doesn't fit because if there were multiple humans, they would all independently confirm the presence of humans, meaning the statement would be redundant.\n\n4. **Critiquing the Pros and Cons**:\n - The response \"At least one of us is a human\" aligns perfectly with both scenarios where there is only one or no human resident.\n - If there were multiple humans, the statement would not be as collectively uniform, as each human would potentially provide the actual number of humans rather than the generic statement.\n\n5. **Deciding on the Best Response**:\n - The simplest explanation is often correct. Given how liars work in logic puzzles, all residents saying \"At least one of us is a human\" implies they are all lying since the statement can be true if only one or no humans exist.\n - Hence, the response from the village where everyone claims there is at least one human indicates they are all lying.\n\n6. **Final Response**:\n - All 100 residents of \u0218irnea have been turned into vampires." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "All 100 residents of \u0218irnea have been turned into vampires.", + "usage": { + "prompt_tokens": 1000, + "completion_tokens": 15 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021bcu IV, son of Costea, has tasked you with investigating the village of \u0218irnea in neighboring Wallachia. The Count's advisors have reported that a vampire was spotted crossing the border near the village, and would like you to investigate it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation. One night, just before dawn, you catch a glimpse of a man in a long black cape with red lining leaping from roof-top to roof-top with superhuman agility. It's a vampire! 
You try to chase the creature back to its home, but the creature is too fast. However, because of the remoteness of the village, you know with absolute certainty that the vampire must be a resident of the village. You decide that your best course of action will be to visit all 100 residents of the town during the day. You know something about vampires and humans that will make your investigation possible; humans always tell the truth, but vampires always lie.\n\nIn the afternoon, you go from house to house, speaking with all 100 residents of \u0218irnea. You ask everyone the same question: \"How many vampires are living in \u0218irnea\". Everyone in the village gives the same response, \"At least one of us is a human.\"\n\nHow many residents of \u0218irnea have been turned into vampires?", + "\n# Correct answer", + "100", + "\n# Answer to be graded", + "All 100 residents of \u0218irnea have been turned into vampires." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 510, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + } +] \ No newline at end of file diff --git a/python/packages/autogen-ext/tests/task_centric_memory/sessions/teachability/session.json b/python/packages/autogen-ext/tests/task_centric_memory/sessions/teachability/session.json new file mode 100644 index 000000000000..71920d7bc0ee --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/sessions/teachability/session.json @@ -0,0 +1,523 @@ +[ + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Does the following text contain any information or advice that might be useful later?\n- If so, please copy the information or advice, adding no extra commentary or formatting.\n- If there is no potentially useful information or advice 
at all, simply write \"None\" with no punctuation.", + "\n# Text to analyze", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "None", + "usage": { + "prompt_tokens": 108, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. **Understanding the User Message:**\n The user is inquiring about the possibility of creating a new package for the `autogen` framework, specifically for a \"copilot extension agent\" that they have developed. This indicates that the user has developed an extension or module utilizing `autogen` and wants to know if they can contribute this as a package to the `autogen` ecosystem.\n\n2. **Possible Responses:**\n a. Yes, you can create and contribute a new package to the `autogen` framework. 
Here's how you can do it.\n b. No, currently `autogen` does not accept community contributions for new packages.\n c. Contact the maintainers of `autogen` for more specific guidance on contributing new packages.\n d. Provide the user with a general process for contributing to open-source projects, which may include documentation, guidelines, and points of contact for contributions.\n\n3. **Pros and Cons of Each Response:**\n a. **Pros:** Encourages community involvement and can lead to a richer ecosystem. Provides a positive and actionable response. **Cons:** Assumes the open-source nature and plugin capability of `autogen`, which may not be accurate without further information.\n b. **Pros:** Provides a clear and definitive answer if true. **Cons:** Potentially discouraging for the user if not accurate, and lacks constructive guidance.\n c. **Pros:** Directs the user to the source of authoritative information, ensuring they get the correct guidance. **Cons:** Requires additional effort from the user to seek out and contact maintainers.\n d. **Pros:** Offers a general framework that could be applicable to many projects, encourages good practices. **Cons:** Generic advice may not address specific requirements or processes of `autogen`.\n\n4. **Critique of the Pros and Cons:**\n - The assumption in response a might not hold if information about `autogen`'s contribution policy is not known. However, it remains a highly constructive approach.\n - Response b is only useful if factually correct, but without definite knowledge, it could mislead.\n - Response c effectively puts the onus of action on the user but ensures they get accurate and relevant information.\n - Response d is practical but might not provide the specificities needed for `autogen`.\n\n5. **Decision on Best Response:**\n - The best response would likely be a combination of a and c, assuming the open-source nature of `autogen` and suggesting the user verify this with maintainers.\n\n6. 
**Final Response:**\n Yes, you can potentially create and contribute a new package to the `autogen` framework. It's a great way to enhance the ecosystem of tools. To ensure you're following the correct process and guidelines, I recommend contacting the maintainers of `autogen` for specific instructions regarding contributions.", + "usage": { + "prompt_tokens": 182, + "completion_tokens": 583 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?", + "\n# Text that may contain an answer", + "1. **Understanding the User Message:**\n The user is inquiring about the possibility of creating a new package for the `autogen` framework, specifically for a \"copilot extension agent\" that they have developed. This indicates that the user has developed an extension or module utilizing `autogen` and wants to know if they can contribute this as a package to the `autogen` ecosystem.\n\n2. **Possible Responses:**\n a. Yes, you can create and contribute a new package to the `autogen` framework. Here's how you can do it.\n b. No, currently `autogen` does not accept community contributions for new packages.\n c. 
Contact the maintainers of `autogen` for more specific guidance on contributing new packages.\n d. Provide the user with a general process for contributing to open-source projects, which may include documentation, guidelines, and points of contact for contributions.\n\n3. **Pros and Cons of Each Response:**\n a. **Pros:** Encourages community involvement and can lead to a richer ecosystem. Provides a positive and actionable response. **Cons:** Assumes the open-source nature and plugin capability of `autogen`, which may not be accurate without further information.\n b. **Pros:** Provides a clear and definitive answer if true. **Cons:** Potentially discouraging for the user if not accurate, and lacks constructive guidance.\n c. **Pros:** Directs the user to the source of authoritative information, ensuring they get the correct guidance. **Cons:** Requires additional effort from the user to seek out and contact maintainers.\n d. **Pros:** Offers a general framework that could be applicable to many projects, encourages good practices. **Cons:** Generic advice may not address specific requirements or processes of `autogen`.\n\n4. **Critique of the Pros and Cons:**\n - The assumption in response a might not hold if information about `autogen`'s contribution policy is not known. However, it remains a highly constructive approach.\n - Response b is only useful if factually correct, but without definite knowledge, it could mislead.\n - Response c effectively puts the onus of action on the user but ensures they get accurate and relevant information.\n - Response d is practical but might not provide the specificities needed for `autogen`.\n\n5. **Decision on Best Response:**\n - The best response would likely be a combination of a and c, assuming the open-source nature of `autogen` and suggesting the user verify this with maintainers.\n\n6. **Final Response:**\n Yes, you can potentially create and contribute a new package to the `autogen` framework. 
It's a great way to enhance the ecosystem of tools. To ensure you're following the correct process and guidelines, I recommend contacting the maintainers of `autogen` for specific instructions regarding contributions." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Yes, you can potentially create and contribute a new package to the `autogen` framework. To ensure you're following the correct process and guidelines, I recommend contacting the maintainers of `autogen` for specific instructions regarding contributions.", + "usage": { + "prompt_tokens": 749, + "completion_tokens": 47 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?", + "\n# Correct answer", + "It's best to have your agent in its own repo, then add the autogen-extension 
topic to that repo.", + "\n# Answer to be graded", + "Yes, you can potentially create and contribute a new package to the `autogen` framework. To ensure you're following the correct process and guidelines, I recommend contacting the maintainers of `autogen` for specific instructions regarding contributions." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "0", + "usage": { + "prompt_tokens": 300, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Does the following text contain any information or advice that might be useful later?\n- If so, please copy the information or advice, adding no extra commentary or formatting.\n- If there is no potentially useful information or advice at all, simply write \"None\" with no punctuation.", + "\n# Text to analyze", + "When somebody builds something on autogen and wants to contribute it to autogen, instead of creating a new autogen package, it's better for them to implement it in a separate github repo, then just add the autogen-extension topic to that repo. That way their contribution will be automatically discoverable through autogen's page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "When somebody builds something on autogen and wants to contribute it to autogen, instead of creating a new autogen package, it's better for them to implement it in a separate github repo, then just add the autogen-extension topic to that repo. 
That way their contribution will be automatically discoverable through autogen's page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html", + "usage": { + "prompt_tokens": 165, + "completion_tokens": 86 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "When somebody builds something on autogen and wants to contribute it to autogen, instead of creating a new autogen package, it's better for them to implement it in a separate github repo, then just add the autogen-extension topic to that repo. 
That way their contribution will be automatically discoverable through autogen's page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "autogen integration \ncommunity extensions \nGitHub repository management \ncontribution discoverability \nautogen-extension topic \nseparate implementation strategy", + "usage": { + "prompt_tokens": 292, + "completion_tokens": 27 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "When somebody builds something on autogen and wants to contribute it to autogen, instead of creating a new autogen package, it's better for them to implement it in a separate github repo, then just add the autogen-extension topic to that repo. That way their contribution will be automatically discoverable through autogen's page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. 
My understanding of your message is that you are suggesting a best practice for developers who create extensions for Autogen. Instead of creating a new Autogen package to contribute their work, they should implement their extension in a separate GitHub repository and apply the \"autogen-extension\" topic to it. This approach makes the contribution automatically discoverable through Autogen's community extensions page, accessible via the provided link.\n\n2. Possible responses include:\n a. Agreeing with your suggestion and providing additional details or insights.\n b. Explaining the advantages of your suggested approach.\n c. Discussing potential reasons why some might still choose to create a new Autogen package.\n d. Asking clarifying questions to ensure complete understanding or to gather more context.\n\n3. Pros and Cons of each response:\n a. Agreeing with your suggestion:\n - Pros: Reinforces your recommendation, encourages good practices, and aligns with the community\u2019s discovery process.\n - Cons: May not offer new insights or value if the reader is already familiar with the process.\n b. Explaining the advantages:\n - Pros: Provides a clear rationale for the suggested approach, helping others understand its benefits.\n - Cons: Might be redundant if the audience is already aware of these advantages.\n c. Discussing potential reasons for creating a new package:\n - Pros: Acknowledges that there might be scenarios where a separate package is suitable, offering a balanced view.\n - Cons: Could confuse or mislead those looking for straightforward guidance.\n d. Asking clarifying questions:\n - Pros: Ensures complete understanding and allows for a more tailored response.\n - Cons: Delays providing a direct response and may not be necessary if the original message is clear.\n\n4. 
Critiquing the pros and cons:\n - Option a is straightforward and helpful, reinforcing good practices without overcomplicating the response.\n - Option b is beneficial as it provides a deeper understanding, though it could overlap with known information.\n - Option c offers a comprehensive view but could divert from the main intention of promoting the suggested approach.\n - Option d is useful for clarity but unnecessary if the message is already clear.\n\n5. The best response:\n - Option a is the most suitable as it directly supports your suggestion and encourages best practices without introducing potential confusion.\n\n6. Final Response:\nI agree with your suggestion. Implementing the extension in a separate GitHub repository and adding the \"autogen-extension\" topic is a great way to ensure that contributions are automatically discoverable through Autogen's community extensions page. This approach aligns with the current community practices and makes it easier for others to find and utilize the extension.", + "usage": { + "prompt_tokens": 239, + "completion_tokens": 545 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Does the following text contain any information or advice that might be useful later?\n- If so, please copy the information or advice, adding no extra commentary or formatting.\n- If there is no potentially useful information or advice at all, simply write \"None\" with no punctuation.", + "\n# Text to analyze", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?" 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "None", + "usage": { + "prompt_tokens": 108, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "### Simplified Task Explanation\n\n1. **Objective**: The task involves contributing to the autogen project.\n2. **Creation**: The task is to create a new package.\n3. **Purpose of Package**: The package is for a copilot extension agent.\n4. **Platform**: This package should be created using the autogen framework.\n5. **Ownership**: The copilot extension agent referenced is one that you have developed.\n\nThis list outlines the key elements and requirements of the task in simpler terms.", + "usage": { + "prompt_tokens": 115, + "completion_tokens": 107 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. 
Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "### Simplified Task Explanation\n\n1. **Objective**: The task involves contributing to the autogen project.\n2. **Creation**: The task is to create a new package.\n3. **Purpose of Package**: The package is for a copilot extension agent.\n4. **Platform**: This package should be created using the autogen framework.\n5. **Ownership**: The copilot extension agent referenced is one that you have developed.\n\nThis list outlines the key elements and requirements of the task in simpler terms.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "In the context of the task, all points listed are relevant to understanding and addressing the task:\n\n1. **Objective**: Understanding that this is a contribution to the autogen project is critical, as it sets the context and purpose of the task.\n2. **Creation**: Knowing that the task is to create a new package is essential, as it defines the action to be taken.\n3. **Purpose of Package**: Recognizing that the package is for a copilot extension agent helps clarify the specific type of package being developed.\n4. **Platform**: Being aware that the package should be created using the autogen framework is crucial, as it specifies the tools and environment to be used.\n5. 
**Ownership**: Knowing that the copilot extension agent is one that you have developed ensures you have the necessary background and understanding of the agent, which is relevant for creating the package.\n\nAll items provide necessary context or requirements crucial to effectively carrying out the task.", + "usage": { + "prompt_tokens": 259, + "completion_tokens": 196 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a task description. Our job is not to complete the task, but merely rephrase the task in simpler, more general terms, if possible. Please reach through the following task description, then explain your understanding of the task in detail, as a single, flat list of all the important points.", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?" + ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "### Simplified Task Explanation\n\n1. **Objective**: The task involves contributing to the autogen project.\n2. **Creation**: The task is to create a new package.\n3. **Purpose of Package**: The package is for a copilot extension agent.\n4. **Platform**: This package should be created using the autogen framework.\n5. **Ownership**: The copilot extension agent referenced is one that you have developed.\n\nThis list outlines the key elements and requirements of the task in simpler terms.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Do you see any parts of this list that are irrelevant to actually solving the task? If so, explain which items are irrelevant." 
+ ], + "source": "User", + "type": "UserMessage" + }, + { + "content": "In the context of the task, all points listed are relevant to understanding and addressing the task:\n\n1. **Objective**: Understanding that this is a contribution to the autogen project is critical, as it sets the context and purpose of the task.\n2. **Creation**: Knowing that the task is to create a new package is essential, as it defines the action to be taken.\n3. **Purpose of Package**: Recognizing that the package is for a copilot extension agent helps clarify the specific type of package being developed.\n4. **Platform**: Being aware that the package should be created using the autogen framework is crucial, as it specifies the tools and environment to be used.\n5. **Ownership**: Knowing that the copilot extension agent is one that you have developed ensures you have the necessary background and understanding of the agent, which is relevant for creating the package.\n\nAll items provide necessary context or requirements crucial to effectively carrying out the task.", + "thought": null, + "source": "Assistant", + "type": "AssistantMessage" + }, + { + "content": [ + "Revise your original list to include only the most general terms, those that are critical to solving the task, removing any themes or descriptions that are not essential to the solution. Your final list may be shorter, but do not leave out any part of the task that is needed for solving the task. Do not add any additional commentary either before or after the list." + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1. Contribute to autogen.\n2. Create a new package.\n3. 
Package is for a copilot extension agent.", + "usage": { + "prompt_tokens": 539, + "completion_tokens": 27 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are an expert at semantic analysis.", + "type": "SystemMessage" + }, + { + "content": [ + "- My job is to create a thorough index for a book called Task Completion, and I need your help.\n- Every paragraph in the book needs to be indexed by all the topics related to various kinds of tasks and strategies for completing them.\n- Your job is to read the text below and extract the task-completion topics that are covered.\n- The number of topics depends on the length and content of the text. But you should list at least one topic, and potentially many more.\n- Each topic you list should be a meaningful phrase composed of a few words. Don't use whole sentences as topics.\n- Don't include details that are unrelated to the general nature of the task, or a potential strategy for completing tasks.\n- List each topic on a separate line, without any extra text like numbering, or bullets, or any other formatting, because we don't want those things in the index of the book.\n\n", + "# Text to be indexed\n", + "1. Contribute to autogen.\n2. Create a new package.\n3. Package is for a copilot extension agent." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "Contributing to projects \nCreating software packages \nDeveloping extensions \nCollaborating with AI agents", + "usage": { + "prompt_tokens": 233, + "completion_tokens": 19 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "We have been given a potential insight that may or may not be useful for solving a given task.\n- First review the following task.\n- Then review the insight that follows, and consider whether it might help solve the given task.\n- Do not attempt to actually solve the task.\n- Reply with a single character, '1' if the insight may be useful, or '0' if it is not.", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?", + "\n# Possibly useful insight", + "When somebody builds something on autogen and wants to contribute it to autogen, instead of creating a new autogen package, it's better for them to implement it in a separate github repo, then just add the autogen-extension topic to that repo. That way their contribution will be automatically discoverable through autogen's page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html" + ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 223, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.\nIn responding to every user message, you follow the same multi-step process given here:\n1. 
Explain your understanding of the user message in detail, covering all the important points.\n2. List as many possible responses as you can think of.\n3. Carefully list and weigh the pros and cons (if any) of each possible response.\n4. Critique the pros and cons above, looking for any flaws in your reasoning. But don't make up flaws that don't exist.\n5. Decide on the best response, looping back to step 1 if none of the responses are satisfactory.\n6. Finish by providing your final response in the particular format requested by the user.", + "type": "SystemMessage" + }, + { + "content": "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?\n\n## Important insights that may help solve tasks like this\n- When somebody builds something on autogen and wants to contribute it to autogen, instead of creating a new autogen package, it's better for them to implement it in a separate github repo, then just add the autogen-extension topic to that repo. That way their contribution will be automatically discoverable through autogen's page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html\n", + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "### Understanding of the User Message\n\nYou have developed a copilot extension agent using the Autogen framework and are considering contributing this as a new package to Autogen. You are seeking guidance on whether creating a new Autogen package would be appropriate or how best to contribute your work.\n\n### Possible Responses\n\n1. **Suggest creating a separate GitHub repository and adding the `autogen-extension` topic.**\n - Explain the advantages of this approach, including discoverability through the Autogen community extensions page.\n\n2. 
**Advise on creating a new Autogen package within the Autogen framework.**\n - Outline steps for integrating directly into Autogen, if possible.\n\n3. **Provide a comparison of both approaches, highlighting the benefits of each, and let the user decide.**\n - Suggest both methods with pros and cons, allowing the user to choose based on their preferences and goals.\n\n4. **Encourage joining community discussions about Autogen to seek further advice and collaborate with others.**\n - Direct them to forums or community gatherings related to Autogen for support and networking.\n\n### Pros and Cons of Each Response\n\n1. **Creating a separate GitHub repository:**\n - **Pros:** \n - Simplifies the contribution process.\n - Enhances discoverability through the Autogen community page.\n - Allows the contributor to maintain control over their project.\n - **Cons:**\n - May feel less integrated into the Autogen core ecosystem.\n\n2. **Creating a new Autogen package:**\n - **Pros:** \n - Direct integration into Autogen could provide tighter coupling with the platform.\n - **Cons:**\n - More complex process that may not be necessary for extensions.\n - Could be redundant if not aligned with the Autogen contribution guidelines.\n\n3. **Providing both approaches and letting the user decide:**\n - **Pros:** \n - Empowers the user to make an informed decision.\n - Provides a comprehensive overview.\n - **Cons:** \n - May overwhelm the user if they are looking for a direct recommendation.\n\n4. **Encouraging community engagement:**\n - **Pros:** \n - Connects the user with a wider community for support.\n - Allows sharing of experiences and advice.\n - **Cons:** \n - Indirect approach that doesn't immediately address the user's question.\n\n### Critique of the Pros and Cons\n\nThe first option seems the most aligned with the information provided, as it adheres to the suggested guidelines for contributing to Autogen. 
The simplicity and emphasis on discoverability make it a strong choice. The second option, while viable, appears to be more complex and not necessarily in line with the recommended contribution method. The third option gives flexibility but could be unnecessary if the user is looking for a straightforward answer. The fourth option, while beneficial for networking, doesn't directly answer the user's query.\n\n### Decision on the Best Response\n\nThe first response aligns with the provided guidelines and offers a straightforward solution for contributing to Autogen while ensuring discoverability.\n\n### Final Response\n\nTo contribute your copilot extension agent to Autogen, it's recommended to create a separate GitHub repository and add the `autogen-extension` topic to it. This approach ensures your contribution is automatically discoverable through Autogen's community extensions page, enhancing visibility and allowing others to easily find and use your extension.", + "usage": { + "prompt_tokens": 280, + "completion_tokens": 680 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to extract a possible answer to the following question from the given text.\n- First review the following task.\n- Then review the text that follows, which may an answer, plus reasoning that led to the answer.\n- Do not attempt to actually solve the task yourself.\n- Don't try to judge whether the reasoning steps were correct.\n- Simply respond by summarizing the answer described in the text, omitting any other parts of the text.\n- If no answer is present can be extracted from the text, simply reply \"None\".", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?", + "\n# Text that may contain 
an answer", + "### Understanding of the User Message\n\nYou have developed a copilot extension agent using the Autogen framework and are considering contributing this as a new package to Autogen. You are seeking guidance on whether creating a new Autogen package would be appropriate or how best to contribute your work.\n\n### Possible Responses\n\n1. **Suggest creating a separate GitHub repository and adding the `autogen-extension` topic.**\n - Explain the advantages of this approach, including discoverability through the Autogen community extensions page.\n\n2. **Advise on creating a new Autogen package within the Autogen framework.**\n - Outline steps for integrating directly into Autogen, if possible.\n\n3. **Provide a comparison of both approaches, highlighting the benefits of each, and let the user decide.**\n - Suggest both methods with pros and cons, allowing the user to choose based on their preferences and goals.\n\n4. **Encourage joining community discussions about Autogen to seek further advice and collaborate with others.**\n - Direct them to forums or community gatherings related to Autogen for support and networking.\n\n### Pros and Cons of Each Response\n\n1. **Creating a separate GitHub repository:**\n - **Pros:** \n - Simplifies the contribution process.\n - Enhances discoverability through the Autogen community page.\n - Allows the contributor to maintain control over their project.\n - **Cons:**\n - May feel less integrated into the Autogen core ecosystem.\n\n2. **Creating a new Autogen package:**\n - **Pros:** \n - Direct integration into Autogen could provide tighter coupling with the platform.\n - **Cons:**\n - More complex process that may not be necessary for extensions.\n - Could be redundant if not aligned with the Autogen contribution guidelines.\n\n3. 
**Providing both approaches and letting the user decide:**\n - **Pros:** \n - Empowers the user to make an informed decision.\n - Provides a comprehensive overview.\n - **Cons:** \n - May overwhelm the user if they are looking for a direct recommendation.\n\n4. **Encouraging community engagement:**\n - **Pros:** \n - Connects the user with a wider community for support.\n - Allows sharing of experiences and advice.\n - **Cons:** \n - Indirect approach that doesn't immediately address the user's question.\n\n### Critique of the Pros and Cons\n\nThe first option seems the most aligned with the information provided, as it adheres to the suggested guidelines for contributing to Autogen. The simplicity and emphasis on discoverability make it a strong choice. The second option, while viable, appears to be more complex and not necessarily in line with the recommended contribution method. The third option gives flexibility but could be unnecessary if the user is looking for a straightforward answer. The fourth option, while beneficial for networking, doesn't directly answer the user's query.\n\n### Decision on the Best Response\n\nThe first response aligns with the provided guidelines and offers a straightforward solution for contributing to Autogen while ensuring discoverability.\n\n### Final Response\n\nTo contribute your copilot extension agent to Autogen, it's recommended to create a separate GitHub repository and add the `autogen-extension` topic to it. This approach ensures your contribution is automatically discoverable through Autogen's community extensions page, enhancing visibility and allowing others to easily find and use your extension." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "To contribute your copilot extension agent to Autogen, it's recommended to create a separate GitHub repository and add the `autogen-extension` topic to it.", + "usage": { + "prompt_tokens": 846, + "completion_tokens": 33 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + }, + { + "mode": "create", + "messages": [ + { + "content": "You are a helpful and thoughtful assistant.", + "type": "SystemMessage" + }, + { + "content": [ + "Your job is to decide whether a given answer to a task is correct or not.\n- You will be given the task description and the correct, gold-standard answer, along with the answer to be graded.\n- In general, an answer is correct if it is equivalent to the correct answer.\n- Specifically, the given answer must contain the important information from the correct answer, and must not in any way contradict the correct answer.\n- Ignore any differences of grammar, spelling mistakes, punctuation, capitalization, formatting, or extra commentary.\n- An answer should be considered correct if it omits information that is clearly inferred.\n - For instance, if the correct answer is \"Paris, France\", the answer \"Paris\" should be considered correct.\n- Respond with a single character: '1' if the answer to be graded is correct\", '0' if not.", + "\n# Task description", + "As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen?", + "\n# Correct answer", + "It's best to have your agent in its own repo, then add the autogen-extension topic to that repo.", + "\n# Answer to be graded", + "To contribute your copilot extension agent to Autogen, it's recommended to create a separate GitHub repository and add the `autogen-extension` topic to it." 
+ ], + "source": "User", + "type": "UserMessage" + } + ], + "response": { + "finish_reason": "stop", + "content": "1", + "usage": { + "prompt_tokens": 286, + "completion_tokens": 2 + }, + "cached": false, + "logprobs": null, + "thought": null + }, + "stream": [] + } +] \ No newline at end of file diff --git a/python/packages/autogen-ext/tests/task_centric_memory/test_chat_completion_client_recorder.py b/python/packages/autogen-ext/tests/task_centric_memory/test_chat_completion_client_recorder.py new file mode 100644 index 000000000000..11cb90279a03 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/test_chat_completion_client_recorder.py @@ -0,0 +1,70 @@ +import asyncio + +import pytest +from autogen_core.models import ( + CreateResult, + UserMessage, +) +from autogen_ext.experimental.task_centric_memory.utils import PageLogger +from autogen_ext.experimental.task_centric_memory.utils.chat_completion_client_recorder import ( + ChatCompletionClientRecorder, +) +from autogen_ext.models.replay import ReplayChatCompletionClient + +session_file_path = str("./session_1.json") + + +@pytest.mark.asyncio +async def test_record() -> None: + """Test that in record mode, create() records the interaction and writes to disk on finalize().""" + logger = PageLogger(config={"level": "DEBUG", "path": "./logs"}) + logger.enter_function() + + mock_client = ReplayChatCompletionClient( + [ + "Response to message 1", + ] + ) + recorder = ChatCompletionClientRecorder( + mock_client, mode="record", session_file_path=session_file_path, logger=logger + ) + + messages = [UserMessage(content="Message 1", source="User")] + response = await recorder.create(messages) + assert isinstance(response, CreateResult) + assert response.content == "Response to message 1" + + recorder.finalize() + logger.leave_function() + + +@pytest.mark.asyncio +async def test_replay() -> None: + """ + Test that in replay mode, create() replays the recorded response if the messages match, + and 
raises an error if they do not or if records run out. + """ + logger = PageLogger(config={"level": "DEBUG", "path": "./logs"}) + logger.enter_function() + + mock_client = ReplayChatCompletionClient( + [ + "Response that should not be returned", + ] + ) + recorder = ChatCompletionClientRecorder( + mock_client, mode="replay", session_file_path=session_file_path, logger=logger + ) + + messages = [UserMessage(content="Message 1", source="User")] + response = await recorder.create(messages) + assert isinstance(response, CreateResult) + assert response.content == "Response to message 1" + + recorder.finalize() + logger.leave_function() + + +if __name__ == "__main__": + asyncio.run(test_record()) + asyncio.run(test_replay()) diff --git a/python/packages/autogen-ext/tests/task_centric_memory/test_learning_from_demonstration.py b/python/packages/autogen-ext/tests/task_centric_memory/test_learning_from_demonstration.py new file mode 100644 index 000000000000..6fc1c54b0af4 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/test_learning_from_demonstration.py @@ -0,0 +1,133 @@ +import asyncio +import sys +from typing import Any, Dict, Literal + +import pytest +from autogen_core.models import ( + ChatCompletionClient, +) +from autogen_ext.experimental.task_centric_memory.utils import ( + Apprentice, + ChatCompletionClientRecorder, + Grader, + PageLogger, +) +from autogen_ext.models.replay import ReplayChatCompletionClient +from utils import create_oai_client, load_yaml_file + +""" +This code sample connects task-centric memory to a selectable agent with no changes to that agent's code. +See the block diagram in the README for an overview of the components and their interactions. +See the config file configs/demonstration.yaml for an overall view of the structure and settings in this sample. 
+ +Execute the sample with this command: + python eval_learning_from_demonstration.py configs/demonstration.yaml + +Here, to learn from a demonstration means to remember a previously demonstrated solution for the same or a similar task. + +1. The function below asks the agent to perform a reasoning task (ten times) on which it usually fails. +2. The agent is then given one demonstration of how to solve a similar but different task, and the context window is cleared. +3. Finally the agent is tested 10 more times to see if it can retrieve and apply the demonstration to the original task. + +If adapting this sample code to a new setting, the Apprentice class can be used or completely replaced by other code. +""" + + +async def eval_learning_from_demonstration( + apprentice: Apprentice, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any] +) -> str: + """ + Evaluates the ability to learn quickly from demonstrations. + """ + logger.enter_function() + + num_trials = config["num_trials"] + grader = Grader(client, logger) + + # Load the specified data. + main_task = load_yaml_file(config["main_task_file"]) + task_description = main_task["task_description"] + expected_answer = main_task["expected_answer"] + demo_task = load_yaml_file(config["demo_task_file"])["task_description"] + demo_solution = load_yaml_file(config["demo_solution_file"])["insight"] + + # Start by clearing memory then running a baseline test.
+ logger.info("To get a baseline, clear memory, then assign the task.") + apprentice.reset_memory() + num_successes, num_trials = await grader.test_apprentice( + apprentice=apprentice, + task_description=task_description, + expected_answer=expected_answer, + num_trials=num_trials, + use_memory=True, + client=client, + ) + success_rate = round((num_successes / num_trials) * 100) + results_str_1 = "Success rate before demonstration: {}%".format(success_rate) + logger.info("\n" + results_str_1) + + # Provide a demonstration for a similar but different task. + logger.info("Demonstrate a solution to a similar task.") + await apprentice.add_task_solution_pair_to_memory(demo_task, demo_solution) + + # Now test again to see if the demonstration (retrieved from memory) helps. + logger.info("Assign the task again to see if the demonstration helps.") + num_successes, num_trials = await grader.test_apprentice( + apprentice=apprentice, + task_description=task_description, + expected_answer=expected_answer, + num_trials=num_trials, + use_memory=True, + client=client, + ) + success_rate = round((num_successes / num_trials) * 100) + results_str_2 = "Success rate after demonstration: {}%".format(success_rate) + logger.info("\n" + results_str_2) + + logger.leave_function() + return "\neval_learning_from_demonstration\n" + results_str_1 + "\n" + results_str_2 + + +@pytest.mark.asyncio +async def test_memory(mode: Literal["record", "replay"] = "replay") -> None: + """ + Tests memory using the components specified in the config file. + By default, mode is "replay", which uses a pre-recorded session file. + If mode is "record", a new session file is generated for future replay. + """ + test = "demonstration" + config = load_yaml_file(f"./tests/task_centric_memory/configs/{test}.yaml") + + # Create the necessary components. + logger = PageLogger(config["PageLogger"]) + if mode == "record": + # To record a session, we need a real client. 
+ base_client = create_oai_client(config["client"]) + else: + # To replay a session (as in pytest), we can use a mock client. + base_client = ReplayChatCompletionClient( + [ + "not used", + ] + ) + client = ChatCompletionClientRecorder( + base_client, mode, f"./tests/task_centric_memory/sessions/{test}/session.json", logger + ) + apprentice = Apprentice(client, config["Apprentice"], logger) + + # Call the example function. + await eval_learning_from_demonstration(apprentice, client, logger, config["test"]) + + # Clean up. + client.finalize() + + +if __name__ == "__main__": + args = sys.argv[1:] + # Replay mode is enabled by default. + # Record mode is enabled if the first argument is "record". + # Use this to generate a new session file for pytest to use. + mode: Literal["record", "replay"] = "replay" + if (len(args) >= 1) and (args[0] == "record"): + mode = "record" + asyncio.run(test_memory(mode=mode)) diff --git a/python/packages/autogen-ext/tests/task_centric_memory/test_self_teaching.py b/python/packages/autogen-ext/tests/task_centric_memory/test_self_teaching.py new file mode 100644 index 000000000000..45405ff4d66a --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/test_self_teaching.py @@ -0,0 +1,150 @@ +import asyncio +import sys +from typing import Any, Dict, Literal + +import pytest +from autogen_core.models import ( + ChatCompletionClient, +) +from autogen_ext.experimental.task_centric_memory.utils import ( + Apprentice, + ChatCompletionClientRecorder, + Grader, + PageLogger, +) +from autogen_ext.models.replay import ReplayChatCompletionClient +from utils import create_oai_client, load_yaml_file + +""" +This code sample connects task-centric memory to a selectable agent with no changes to that agent's code. +See the block diagram in the README for an overview of the components and their interactions. +See the config file configs/self_teaching.yaml for an overall view of the structure and settings in this sample. 
+ +Execute the sample with this command: + python eval_self_teaching.py configs/self_teaching.yaml + +We say that an agent is self-teaching if it can learn quickly from its own trial and error with no user input. +This sample asks the agent to perform a reasoning task on which it usually fails. +Then using automatic success or failure feedback (for a verifiable task with no side-effects on the environment), +the agent iterates through a background learning loop to find a solution, which it then stores as an insight in memory. +Finally the agent is tested again to see if it can retrieve and apply its insight to the original task, +as well as to a similar but different task as a test of generalization. + +If adapting this sample code to a new setting, the Apprentice class can be used or completely replaced by other code. +""" + + +async def eval_self_teaching( + apprentice: Apprentice, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any] +) -> str: + """ + Evaluates the ability of an agent to learn quickly from its own trial and error. + """ + logger.enter_function() + + num_loops = config["num_loops"] + num_final_test_trials = config["num_final_test_trials"] + grader = Grader(client, logger) + + # Load the specified data. + task_dict_1 = load_yaml_file(config["task_file_1"]) + task_description_1 = task_dict_1["task_description"] + expected_answer_1 = task_dict_1["expected_answer"] + + # Test generalization on this different, similar task. + task_dict_2 = load_yaml_file(config["task_file_2"]) + task_description_2 = task_dict_2["task_description"] + expected_answer_2 = task_dict_2["expected_answer"] + + # Start the test with empty memory. + apprentice.reset_memory() + + total_num_successes_1 = 0 + total_num_successes_2 = 0 + total_num_trials = 0 + for _ in range(num_loops): + # Train on the first task. + await apprentice.train_on_task(task=task_description_1, expected_answer=expected_answer_1) + + # Test on the first task. 
+ num_successes, num_trials = await grader.test_apprentice( + apprentice=apprentice, + task_description=task_description_1, + expected_answer=expected_answer_1, + num_trials=num_final_test_trials, + use_memory=True, + client=client, + ) + logger.info("Task 1 success rate: {}%".format(round((num_successes / num_trials) * 100))) + total_num_successes_1 += num_successes + + # Test on the second task. + num_successes, num_trials = await grader.test_apprentice( + apprentice=apprentice, + task_description=task_description_2, + expected_answer=expected_answer_2, + num_trials=num_final_test_trials, + use_memory=True, + client=client, + ) + logger.info("Task 2 success rate: {}%".format(round((num_successes / num_trials) * 100))) + total_num_successes_2 += num_successes + + total_num_trials += num_final_test_trials + logger.info("") + + overall_success_rate_1 = round((total_num_successes_1 / total_num_trials) * 100) + overall_success_rate_2 = round((total_num_successes_2 / total_num_trials) * 100) + + results_str_1 = "Overall task 1 success rate: {}%".format(overall_success_rate_1) + results_str_2 = "Overall task 2 success rate: {}%".format(overall_success_rate_2) + logger.info("\n" + results_str_1) + logger.info(results_str_2) + + logger.leave_function() + return "\neval_self_teaching\n" + results_str_1 + "\n" + results_str_2 + + +@pytest.mark.asyncio +async def test_memory(mode: Literal["record", "replay"] = "replay") -> None: + """ + Tests memory using the components specified in the config file. + By default, mode is "replay", which uses a pre-recorded session file. + If mode is "record", a new session file is generated for future replay. + """ + test = "self_teaching" + config = load_yaml_file(f"./tests/task_centric_memory/configs/{test}.yaml") + + # Create the necessary components. + logger = PageLogger(config["PageLogger"]) + if mode == "record": + # To record a session, we need a real client. 
+ base_client = create_oai_client(config["client"]) + else: + # To replay a session (as in pytest), we can use a mock client. + base_client = ReplayChatCompletionClient( + [ + "not used", + ] + ) + client = ChatCompletionClientRecorder( + base_client, mode, f"./tests/task_centric_memory/sessions/{test}/session.json", logger + ) + apprentice = Apprentice(client, config["Apprentice"], logger) + + # Call the example function. + await eval_self_teaching(apprentice, client, logger, config["test"]) + + # Clean up. + client.finalize() + + +if __name__ == "__main__": + args = sys.argv[1:] + # Replay mode is enabled by default. + # Record mode is enabled if the first argument is "record". + # Use this to generate a new session file for pytest to use. + mode: Literal["record", "replay"] = "replay" + if (len(args) >= 1) and (args[0] == "record"): + mode = "record" + asyncio.run(test_memory(mode=mode)) diff --git a/python/packages/autogen-ext/tests/task_centric_memory/test_teachability.py b/python/packages/autogen-ext/tests/task_centric_memory/test_teachability.py new file mode 100644 index 000000000000..db0d9754cab5 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/test_teachability.py @@ -0,0 +1,134 @@ +import asyncio +import sys +from typing import Any, Dict, Literal + +import pytest +from autogen_core.models import ( + ChatCompletionClient, +) +from autogen_ext.experimental.task_centric_memory.utils import ( + Apprentice, + ChatCompletionClientRecorder, + Grader, + PageLogger, +) +from autogen_ext.models.replay import ReplayChatCompletionClient +from utils import create_oai_client, load_yaml_file + +""" +This code sample connects task-centric memory to a selectable agent with no changes to that agent's code. +See the block diagram in the README for an overview of the components and their interactions. +See the config file configs/eval_teachability.yaml for an overall view of the structure and settings in this sample. 
+ +Execute the sample with this command: + python eval_teachability.py configs/eval_teachability.yaml + +Teachable agents use memory to learn quickly from user teachings, hints, and advice. +The function below passes user instructions (loaded from a file) to the agent by calling Apprentice.handle_user_message(). +If adapting this sample code to a new setting, the Apprentice class can be used or completely replaced by other code. + +1. In the first conversation, the agent is expected to fail because it lacks the necessary knowledge. +2. In the second conversation (starting with an empty context window), the user provides the missing insight. +3. In the third conversation, the agent is expected to succeed after retrieving the key insight from memory. +""" + + +async def eval_teachability( + apprentice: Apprentice, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any] +) -> str: + """ + Evaluates the ability to learn quickly from user teachings, hints, and advice. + """ + logger.enter_function() + + # Load the specified data. + task_dict = load_yaml_file(config["task_file"]) + task_description = task_dict["task_description"] + expected_answer = task_dict["expected_answer"] + + insight_dict = load_yaml_file(config["insight_file"]) + insight = insight_dict["insight"] + + # First test without memory. + apprentice.reset_memory() + logger.info("\nClear memory, then ask the question.") + response = await apprentice.handle_user_message(task_description) + + # Check the response. + grader = Grader(client, logger) + response_is_correct, extracted_answer = await grader.is_response_correct( + task_description, response, expected_answer + ) + logger.info("Extracted answer: {}".format(extracted_answer)) + if response_is_correct: + results_str_1 = "Answer before teaching is CORRECT." + else: + results_str_1 = "Answer before teaching is INCORRECT." + logger.info(results_str_1 + "\n") + + # Give advice that should help solve this task. 
+ logger.info("Give the advice.") + await apprentice.handle_user_message(insight) + + # Now ask the question again to see if the advice helps. + logger.info("\nAsk the question again to see if the advice helps.") + response = await apprentice.handle_user_message(task_description) + + # Check the response. + response_is_correct, extracted_answer = await grader.is_response_correct( + task_description, response, expected_answer + ) + logger.info("Extracted answer: {}".format(extracted_answer)) + if response_is_correct: + results_str_2 = "Answer after teaching is CORRECT." + else: + results_str_2 = "Answer after teaching is INCORRECT." + logger.info(results_str_2 + "\n") + + logger.leave_function() + return "\neval_teachability\n" + results_str_1 + "\n" + results_str_2 + + +@pytest.mark.asyncio +async def test_memory(mode: Literal["record", "replay"] = "replay") -> None: + """ + Tests memory using the components specified in the config file. + By default, mode is "replay", which uses a pre-recorded session file. + If mode is "record", a new session file is generated for future replay. + """ + test = "teachability" + config = load_yaml_file(f"./tests/task_centric_memory/configs/{test}.yaml") + + # Create the necessary components. + logger = PageLogger(config["PageLogger"]) + if mode == "record": + # To record a session, we need a real client. + base_client = create_oai_client(config["client"]) + else: + # To replay a session (as in pytest), we can use a mock client. + base_client = ReplayChatCompletionClient( + [ + "not used", + ] + ) + client = ChatCompletionClientRecorder( + base_client, mode, f"./tests/task_centric_memory/sessions/{test}/session.json", logger + ) + apprentice = Apprentice(client, config["Apprentice"], logger) + + # Call the example function. + await eval_teachability(apprentice, client, logger, config["test"]) + + # Clean up. + client.finalize() + + +if __name__ == "__main__": + args = sys.argv[1:] + # Replay mode is enabled by default. 
+ # Record mode is enabled if the first argument is "record". + # Use this to generate a new session file for pytest to use. + mode: Literal["record", "replay"] = "replay" + if (len(args) >= 1) and (args[0] == "record"): + mode = "record" + asyncio.run(test_memory(mode=mode)) diff --git a/python/packages/autogen-ext/tests/task_centric_memory/utils.py b/python/packages/autogen-ext/tests/task_centric_memory/utils.py new file mode 100644 index 000000000000..196176915061 --- /dev/null +++ b/python/packages/autogen-ext/tests/task_centric_memory/utils.py @@ -0,0 +1,31 @@ +from typing import Any, Dict + +import yaml +from autogen_core.models import ( + ChatCompletionClient, +) +from autogen_ext.models.openai import OpenAIChatCompletionClient + + +def create_oai_client(config: Dict[str, Any]) -> ChatCompletionClient: + """ + Creates a chat completion client from OpenAI. + """ + client = OpenAIChatCompletionClient( + model=config["model"], + max_tokens=config["max_completion_tokens"], + max_retries=config["max_retries"], + temperature=config["temperature"], + presence_penalty=config["presence_penalty"], + frequency_penalty=config["frequency_penalty"], + top_p=config["top_p"], + ) + return client + + +def load_yaml_file(file_path: str) -> Any: + """ + Opens a file and returns its contents. + """ + with open(file_path, "r") as file: + return yaml.load(file, Loader=yaml.FullLoader) diff --git a/python/samples/task_centric_memory/README.md b/python/samples/task_centric_memory/README.md new file mode 100644 index 000000000000..2ae16228efc7 --- /dev/null +++ b/python/samples/task_centric_memory/README.md @@ -0,0 +1,138 @@ +# Task-Centric Memory Code Samples +_(EXPERIMENTAL, RESEARCH IN PROGRESS)_ + +

+ Description +

+ +This directory contains code samples that illustrate the following forms of fast, memory-based learning: +* Direct memory storage and retrieval +* Learning from user advice and corrections +* Learning from user demonstrations +* Learning from the agent's own experience + +Each sample connects task-centric memory to a selectable agent with no changes to that agent's code. +See the block diagram to the right for an overview of the components and their interactions. + +Each sample is contained in a separate python script, using data and configs stored in yaml files for easy modification. +Note that since agent behavior is non-deterministic, results will vary between runs. + +To watch operations live in a browser and see how task-centric memory works, +open the HTML page at the location specified at the top of the config file, +such as: `./pagelogs/teachability/0 Call Tree.html` +To turn off logging entirely, set logging level to NONE in the config file. + +The config files specify an _AssistantAgent_ by default, which uses a fixed, multi-step system prompt. +To use _MagenticOneGroupChat_ instead, specify that in the yaml file where indicated. + + +## Installation + +Install AutoGen and its extension package as follows: + +```bash +pip install -U "autogen-agentchat" "autogen-ext[openai]" "autogen-ext[task-centric-memory]" +``` + +Assign your OpenAI key to the environment variable OPENAI_API_KEY, +or else modify `utils/client.py` as appropriate for the model you choose. + + +## Running the Samples + +The following samples are listed in order of increasing complexity. +Execute the corresponding commands from this (autogen_ext/task_centric_memory) directory. + + +### Making AssistantAgent Teachable + +This short, interactive code sample shows how to make the AssistantAgent teachable. +The following steps show the agent learning a user teaching from one chat session to the next, +starting with an empty memory bank. 
+The memory bank can be cleared manually by deleting the memory_bank directory (if it exists from a prior run), as shown below. + +```bash +rm -r memory_bank +python chat_with_teachable_agent.py +Now chatting with a teachable agent. Please enter your first message. Type 'exit' or 'quit' to quit. + +You: How many items should be put in research summaries? +---------- user ---------- +How many items should be put in research summaries? +---------- teachable_agent ---------- + + +You: Whenever asked to prepare a research summary, try to cover just the 5 top items. +---------- user ---------- +Whenever asked to prepare a research summary, try to cover just the 5 top items. +---------- teachable_agent ---------- + + +You: quit + +python chat_with_teachable_agent.py` +Now chatting with a teachable agent. Please enter your first message. Type 'exit' or 'quit' to quit. + +You: How many items should be put in research summaries? +---------- user ---------- +How many items should be put in research summaries? +---------- teachable_agent ---------- +[MemoryContent(content='Whenever asked to prepare a research summary, try to cover just the 5 top items.', mime_type='MemoryMimeType.TEXT', metadata={})] +---------- teachable_agent ---------- + +``` + + +### Direct Memory Storage and Retrieval + +This sample shows how an app can access the `MemoryController` directly +to retrieve previously stored task-insight pairs as potentially useful examplars when solving some new task. +A task is any text instruction that the app may give to an agent. +An insight is any text (like a hint, advice, a demonstration or plan) that might help the agent perform such tasks. + +A typical app will perform the following steps in some interleaved order: +1. Call the `MemoryController` repeatedly to store a set of memories (task-insight pairs). +2. Call the `MemoryController` repeatedly to retrieve any memories related to a new task. +3. 
Use the retrieved insights, typically by adding them to the agent's context window. (This step is not illustrated by this code sample.) +This sample code adds several task-insight pairs to memory, retrieves memories for a set of new tasks, +logs the full retrieval results, and reports the retrieval precision and recall. + +`python eval_retrieval.py configs/retrieval.yaml` + +Precision and recall for this sample are usually near 100%. + + +### Agent Learning from User Advice and Corrections + +This sample first tests the agent (once) for knowledge it currently lacks. +Then the agent is given advice to help it solve the task, and the context window is cleared. +Finally the agent is tested once again to see if it can retrieve and use the advice successfully. + +`python eval_teachability.py configs/teachability.yaml` + +With the benefit of memory, the agent usually succeeds on this sample. + + +### Agent Learning from User Demonstrations + +This sample asks the agent to perform a reasoning task (ten times) on which it usually fails. +The agent is then given one demonstration of how to solve a similar but different task, and the context window is cleared. +Finally the agent is tested 10 more times to see if it can retrieve and apply the demonstration to the original task. + +`python eval_learning_from_demonstration.py configs/demonstration.yaml` + +The agent's success rate tends to be measurably higher after the demonstration has been stored in memory. + + +### Agent Learning from Its Own Experience + +This sample asks the agent to perform a reasoning task on which it usually fails. +Then using automatic success or failure feedback (for a verifiable task with no side-effects on the environment), +the agent iterates through a background learning loop to find a solution, which it then stores as an insight in memory.
import asyncio

from autogen_agentchat.agents import AssistantAgent
from autogen_agentchat.ui import Console
from autogen_ext.experimental.task_centric_memory import MemoryController
from autogen_ext.experimental.task_centric_memory.utils import Teachability
from autogen_ext.models.openai import OpenAIChatCompletionClient


async def main() -> None:
    """
    Chats interactively with a teachable agent whose memory persists across runs.

    The agent is an ordinary AssistantAgent; task-centric memory is attached
    through the Teachability adapter with no changes to the agent's code.
    """
    # Create a client
    client = OpenAIChatCompletionClient(model="gpt-4o-2024-08-06")

    # Create an instance of Task-Centric Memory, passing minimal parameters for this simple example.
    # reset=False so that teachings from prior sessions are retained on disk.
    memory_controller = MemoryController(reset=False, client=client)

    # Wrap the memory controller in a Teachability instance
    teachability = Teachability(memory_controller=memory_controller)

    # Create an AssistantAgent, and attach teachability as its memory
    assistant_agent = AssistantAgent(
        name="teachable_agent",
        system_message="You are a helpful AI assistant, with the special ability to remember user teachings from prior conversations.",
        model_client=client,
        memory=[teachability],
    )

    # Enter a loop to chat with the teachable agent
    print("Now chatting with a teachable agent. Please enter your first message. Type 'exit' or 'quit' to quit.")
    while True:
        user_input = input("\nYou: ")
        if user_input.lower() in ["exit", "quit"]:
            break
        await Console(assistant_agent.run_stream(task=user_input))


if __name__ == "__main__":
    asyncio.run(main())
+ path: ./pagelogs/retrieval + +client: + model: gpt-4o-2024-08-06 + temperature: 0.8 + max_completion_tokens: 4096 + presence_penalty: 0.0 + frequency_penalty: 0.0 + top_p: 1.0 + max_retries: 65535 + +MemoryController: + MemoryBank: + path: ./memory_bank/retrieval + relevance_conversion_threshold: 1.7 + n_results: 25 + distance_threshold: 100 + +test: + tasks: + - data_files/tasks/10_liars.yaml + - data_files/tasks/100_vampires.yaml + - data_files/tasks/autogen_package.yaml + - data_files/tasks/cell_towers_1.yaml + - data_files/tasks/cell_towers_2.yaml + insights: + - data_files/insights/add_topic.yaml + - data_files/insights/cell_towers_2_demo.yaml + - data_files/insights/liar_advice.yaml + task_insight_relevance: # Rows and columns represent (respectively) the tasks and insights listed above. + - [0, 0, 2] # 2 denotes a mutually relevant task-insight pair, stored in memory. + - [0, 0, 1] # 1 denotes a mutually relevant task-insight pair, not stored in memory. + - [2, 0, 0] # 0 denotes a mutually irrelevant task-insight pair. + - [0, 1, 0] + - [0, 2, 0] diff --git a/python/samples/task_centric_memory/configs/self_teaching.yaml b/python/samples/task_centric_memory/configs/self_teaching.yaml new file mode 100644 index 000000000000..a6e28eaa6245 --- /dev/null +++ b/python/samples/task_centric_memory/configs/self_teaching.yaml @@ -0,0 +1,31 @@ + +PageLogger: + level: DEBUG # DEBUG, INFO, WARNING, ERROR, CRITICAL, or NONE. + path: ./pagelogs/self-teaching + +client: + model: gpt-4o-2024-08-06 + temperature: 0.8 + max_completion_tokens: 4096 + presence_penalty: 0.0 + frequency_penalty: 0.0 + top_p: 1.0 + max_retries: 65535 + +Apprentice: + name_of_agent_or_team: AssistantAgent # AssistantAgent or MagenticOneGroupChat + disable_prefix_caching: 1 # If true, prepends a small random string to the context, to decorrelate repeated runs. 
  MemoryController:
    max_train_trials: 10
    max_test_trials: 3
    MemoryBank:
      path: ./memory_bank/self_teaching
      relevance_conversion_threshold: 1.7
      n_results: 25
      distance_threshold: 100
+ +insight: 'When somebody builds something on autogen and wants to contribute it to autogen, + instead of creating a new autogen package, it''s better for them to implement it in a separate github repo, + then just add the autogen-extension topic to that repo. + That way their contribution will be automatically discoverable through autogen''s page on community extensions: https://microsoft.github.io/autogen/dev/user-guide/extensions-user-guide/index.html' diff --git a/python/samples/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml b/python/samples/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml new file mode 100644 index 000000000000..ca00473067ef --- /dev/null +++ b/python/samples/task_centric_memory/data_files/insights/cell_towers_2_demo.yaml @@ -0,0 +1,11 @@ +# Demo created manually for solving the cell_towers_2 task. + +insight: 'Sort the houses by location: 3, 6, 10, 11, 12, 17, 19, 20. + Then start at one end and place the towers only where absolutely needed. + The house at 3 could be served by a tower as far away as mile marker 7, because 3 + 4 = 7, so place a tower at 7. + This obviously covers houses up to mile 7. + But a coverage radius of 4 miles (in each direction) means a total coverage of 8 miles. + So the tower at mile 7 would reach all the way to mile 11, covering the houses at 10 and 11. + The next uncovered house would be at mile 12 (not 10), requiring a second tower. + It could go at mile 16 (which is 12 + 4) and this tower would reach up to mile 20 (16 + 4), + covering the remaining houses. So 2 towers would be enough.' diff --git a/python/samples/task_centric_memory/data_files/insights/liar_advice.yaml b/python/samples/task_centric_memory/data_files/insights/liar_advice.yaml new file mode 100644 index 000000000000..502379d37896 --- /dev/null +++ b/python/samples/task_centric_memory/data_files/insights/liar_advice.yaml @@ -0,0 +1,6 @@ +# Advice created automatically for solving the 10_liars task. 
+ +insight: 'When solving logic puzzles, carefully consider all possible scenarios, + including the simplest ones, and remember that if everyone is lying, + their statements should naturally align with the known conditions without needing a truth-teller. + Always double-check that your conclusions don''t inadvertently introduce contradictions.' diff --git a/python/samples/task_centric_memory/data_files/tasks/100_vampires.yaml b/python/samples/task_centric_memory/data_files/tasks/100_vampires.yaml new file mode 100644 index 000000000000..2e2341d91fd1 --- /dev/null +++ b/python/samples/task_centric_memory/data_files/tasks/100_vampires.yaml @@ -0,0 +1,22 @@ +# From GAIA L1 + +task_description: "You are Van Helsing, a renowned vampire hunter. A Count of Moldova, La\u021B\ + cu IV, son of Costea, has tasked you with investigating the village of \u0218\ + irnea in neighboring Wallachia. The Count's advisors have reported that a vampire\ + \ was spotted crossing the border near the village, and would like you to investigate\ + \ it.\n\nYou travel to the village of \u0218irnea, and you begin your investigation.\ + \ One night, just before dawn, you catch a glimpse of a man in a long black\ + \ cape with red lining leaping from roof-top to roof-top with superhuman agility.\ + \ It's a vampire! You try to chase the creature back to its home, but the creature\ + \ is too fast. However, because of the remoteness of the village, you know with\ + \ absolute certainty that the vampire must be a resident of the village. You\ + \ decide that your best course of action will be to visit all 100 residents\ + \ of the town during the day. You know something about vampires and humans that\ + \ will make your investigation possible; humans always tell the truth, but vampires\ + \ always lie.\n\nIn the afternoon, you go from house to house, speaking with\ + \ all 100 residents of \u0218irnea. You ask everyone the same question: \"How\ + \ many vampires are living in \u0218irnea\". 
Everyone in the village gives the\ + \ same response, \"At least one of us is a human.\"\n\nHow many residents of\ + \ \u0218irnea have been turned into vampires?" + +expected_answer: '100' diff --git a/python/samples/task_centric_memory/data_files/tasks/10_liars.yaml b/python/samples/task_centric_memory/data_files/tasks/10_liars.yaml new file mode 100644 index 000000000000..096e12775935 --- /dev/null +++ b/python/samples/task_centric_memory/data_files/tasks/10_liars.yaml @@ -0,0 +1,8 @@ +# Similar to the 100 vampires task, for testing generalization from one to the other. + +task_description: 'You ask ten people ''How many of you are liars?'' + They all answer ''At least one of us is not a liar.'' + You happen to know that at least one of them IS a liar. + How many of them are liars in total?' + +expected_answer: All of them are liars. diff --git a/python/samples/task_centric_memory/data_files/tasks/autogen_package.yaml b/python/samples/task_centric_memory/data_files/tasks/autogen_package.yaml new file mode 100644 index 000000000000..f80840b30073 --- /dev/null +++ b/python/samples/task_centric_memory/data_files/tasks/autogen_package.yaml @@ -0,0 +1,5 @@ +# Test where human advice is needed. + +task_description: As a contribution to autogen, can I create a new autogen package for a copilot extension agent that I built on autogen? + +expected_answer: It's best to have your agent in its own repo, then add the autogen-extension topic to that repo. diff --git a/python/samples/task_centric_memory/data_files/tasks/cell_towers_1.yaml b/python/samples/task_centric_memory/data_files/tasks/cell_towers_1.yaml new file mode 100644 index 000000000000..f86e370db3ee --- /dev/null +++ b/python/samples/task_centric_memory/data_files/tasks/cell_towers_1.yaml @@ -0,0 +1,9 @@ +# File-free version of a GAIA L1 task. + +task_description: You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. 
+ Houses are located at mile markers 16, 17, 19, 11, 9, 10, 2, 5, 4. + Each cell phone tower can cover houses located next to the road within a 4-mile radius. + Find the minimum number of cell phone towers needed to cover all houses next to the road. + Your answer should be a positive numerical integer value. + +expected_answer: '2' diff --git a/python/samples/task_centric_memory/data_files/tasks/cell_towers_2.yaml b/python/samples/task_centric_memory/data_files/tasks/cell_towers_2.yaml new file mode 100644 index 000000000000..5ddc046920c9 --- /dev/null +++ b/python/samples/task_centric_memory/data_files/tasks/cell_towers_2.yaml @@ -0,0 +1,9 @@ +# Similar to the cell_towers_1 task. + +task_description: You are a telecommunications engineer who wants to build cell phone towers on a stretch of road. + Houses are located at mile markers 17, 20, 19, 10, 11, 12, 3, 6. + Each cell phone tower can cover houses located next to the road within a 4-mile radius. + Find the minimum number of cell phone towers needed to cover all houses next to the road. + Your answer should be a positive numerical integer value. + +expected_answer: '2' diff --git a/python/samples/task_centric_memory/eval_learning_from_demonstration.py b/python/samples/task_centric_memory/eval_learning_from_demonstration.py new file mode 100644 index 000000000000..e61c7ff7d3db --- /dev/null +++ b/python/samples/task_centric_memory/eval_learning_from_demonstration.py @@ -0,0 +1,110 @@ +import asyncio +import sys +from typing import Any, Dict + +from autogen_core.models import ( + ChatCompletionClient, +) +from autogen_ext.experimental.task_centric_memory.utils import Apprentice, Grader, PageLogger +from utils import create_oai_client, load_yaml_file + + +""" +This code sample connects task-centric memory to a selectable agent with no changes to that agent's code. +See the block diagram in the README for an overview of the components and their interactions. 
import asyncio
import sys
from typing import Any, Dict

from autogen_core.models import ChatCompletionClient
from autogen_ext.experimental.task_centric_memory.utils import Apprentice, Grader, PageLogger
from utils import create_oai_client, load_yaml_file


"""
This code sample connects task-centric memory to a selectable agent with no changes to that agent's code.
See the block diagram in the README for an overview of the components and their interactions.
See the config file configs/demonstration.yaml for an overall view of the structure and settings in this sample.

Execute the sample with this command:
    python eval_learning_from_demonstration.py configs/demonstration.yaml

Here, to learn from a demonstration means to remember a previously demonstrated solution for the same or a similar task.

1. The function below asks the agent to perform a reasoning task (ten times) on which it usually fails.
2. The agent is then given one demonstration of how to solve a similar but different task, and the context window is cleared.
3. Finally the agent is tested 10 more times to see if it can retrieve and apply the demonstration to the original task.

If adapting this sample code to a new setting, the Apprentice class can be used or completely replaced by other code.
"""


async def eval_learning_from_demonstration(
    apprentice: Apprentice, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any]
) -> str:
    """
    Evaluates the ability to learn quickly from demonstrations.

    Measures the task success rate before and after a single demonstrated
    solution (for a similar task) is stored in memory, and returns both
    rates as a summary string.
    """
    logger.enter_function()

    num_trials = config["num_trials"]
    grader = Grader(client, logger)

    # Load the specified data.
    main_task = load_yaml_file(config["main_task_file"])
    task_description = main_task["task_description"]
    expected_answer = main_task["expected_answer"]
    demo_task = load_yaml_file(config["demo_task_file"])["task_description"]
    demo_solution = load_yaml_file(config["demo_solution_file"])["insight"]

    # Start by clearing memory then running a baseline test.
    # (Memory is enabled but empty, so retrieval contributes nothing yet.)
    logger.info("To get a baseline, clear memory, then assign the task.")
    apprentice.reset_memory()
    num_successes, num_trials = await grader.test_apprentice(
        apprentice=apprentice,
        task_description=task_description,
        expected_answer=expected_answer,
        num_trials=num_trials,
        use_memory=True,
        client=client,
    )
    success_rate = round((num_successes / num_trials) * 100)
    results_str_1 = "Success rate before demonstration: {}%".format(success_rate)
    logger.info("\n" + results_str_1)

    # Provide a demonstration for a similar but different task.
    logger.info("Demonstrate a solution to a similar task.")
    await apprentice.add_task_solution_pair_to_memory(demo_task, demo_solution)

    # Now test again to see if the demonstration (retrieved from memory) helps.
    logger.info("Assign the task again to see if the demonstration helps.")
    num_successes, num_trials = await grader.test_apprentice(
        apprentice=apprentice,
        task_description=task_description,
        expected_answer=expected_answer,
        num_trials=num_trials,
        use_memory=True,
        client=client,
    )
    success_rate = round((num_successes / num_trials) * 100)
    results_str_2 = "Success rate after demonstration: {}%".format(success_rate)
    logger.info("\n" + results_str_2)

    logger.leave_function()
    return "\neval_learning_from_demonstration\n" + results_str_1 + "\n" + results_str_2


async def run_example(config_filepath: str) -> None:
    """
    Runs the code example with the necessary components.
    """
    config = load_yaml_file(config_filepath)

    # Create the necessary components.
    logger = PageLogger(config["PageLogger"])
    client = create_oai_client(config["client"])
    apprentice = Apprentice(client, config["Apprentice"], logger)

    # Call the example function.
    results = await eval_learning_from_demonstration(apprentice, client, logger, config["test"])

    # Finish up.
    print(results)


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 1:
        # Print usage information.
        print("Usage:  python eval_learning_from_demonstration.py  <path to *.yaml file>")
    else:
        # Run the code example.
        asyncio.run(run_example(config_filepath=args[0]))
import asyncio
import sys
from typing import Any, Dict, Set

from autogen_core.models import ChatCompletionClient
from autogen_ext.experimental.task_centric_memory import MemoryController
from autogen_ext.experimental.task_centric_memory.utils import PageLogger
from utils import create_oai_client, load_yaml_file


"""
This code sample evaluates memory precision and recall, with no agent involved at all.
See the config file configs/retrieval.yaml for an overall view of the structure and settings in this sample,
as well as the data files used for the test.

Execute the sample with this command:
    python eval_retrieval.py configs/retrieval.yaml

This sample shows how an app can access the `MemoryController` directly
to retrieve previously stored task-insight pairs as potentially useful examplars when solving some new task.
A task is any text instruction that the app may give to an agent.
An insight is any text (like a hint, advice, a demonstration or plan) that might help the agent perform such tasks.
"""


async def eval_retrieval(
    memory_controller: MemoryController, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any]
) -> str:
    """
    Evaluates precision and recall of task-centric memory retrieval.

    Stores the task-insight pairs marked 2 in the relevance matrix, retrieves
    memos for every task, and scores retrieval against the pairs marked > 0
    (the ground-truth relevant set).
    """
    logger.enter_function()

    # Load the specified data.
    task_files = config["tasks"]
    task_list = [load_yaml_file(task)["task_description"] for task in task_files]

    insight_files = config["insights"]
    insight_list = [load_yaml_file(insight)["insight"] for insight in insight_files]

    task_insight_relevance = config["task_insight_relevance"]

    # Clear memory, then store the specified task-insight pairs.
    memory_controller.reset_memory()
    for ti, task in enumerate(task_list):
        for ii, insight in enumerate(insight_list):
            if task_insight_relevance[ti][ii] == 2:
                await memory_controller.add_memo(task=task, insight=insight)

    # Test memory retrieval.
    num_retrieved = 0
    num_relevant = 0
    num_relevant_and_retrieved = 0
    for ti, task in enumerate(task_list):
        # Retrieve insights for this task.
        memos = await memory_controller.retrieve_relevant_memos(task=task)
        set_of_retrieved_insights = set(memo.insight for memo in memos)

        # Gather the insights that are relevant to this task according to ground truth.
        set_of_relevant_insights: Set[str] = set()
        for ii, insight in enumerate(insight_list):
            if task_insight_relevance[ti][ii] > 0:
                set_of_relevant_insights.add(insight)

        # Accumulate the counts.
        num_retrieved += len(set_of_retrieved_insights)
        num_relevant += len(set_of_relevant_insights)
        num_relevant_and_retrieved += len(set_of_relevant_insights & set_of_retrieved_insights)
    logger.info("\nNum retrieved:  {}".format(num_retrieved))
    logger.info("\nNum relevant:  {}".format(num_relevant))
    logger.info("\nNum relevant and retrieved:  {}".format(num_relevant_and_retrieved))

    # Compute precision and recall as percentages.
    precision = num_relevant_and_retrieved / num_retrieved if num_retrieved > 0 else 0
    recall = num_relevant_and_retrieved / num_relevant if num_relevant > 0 else 0
    precision_str = "Precision:  {:.3f}%".format(precision * 100)
    recall_str = "Recall:     {:.3f}%".format(recall * 100)
    logger.info("\n" + precision_str)
    logger.info("\n" + recall_str)

    logger.leave_function()
    return "\neval_retrieval\n" + precision_str + "\n" + recall_str


async def run_example(config_filepath: str) -> None:
    """
    Runs the code example with the necessary components.
    """
    config = load_yaml_file(config_filepath)

    # Create the necessary components.
    logger = PageLogger(config["PageLogger"])
    client = create_oai_client(config["client"])
    memory_controller = MemoryController(
        reset=True,
        client=client,
        task_assignment_callback=None,
        config=config["MemoryController"],
        logger=logger,
    )

    # Call the example function.
    results = await eval_retrieval(memory_controller, client, logger, config["test"])

    # Finish up.
    print(results)


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 1:
        # Print usage information.
        print("Usage:  python eval_retrieval.py  <path to *.yaml file>")
    else:
        # Run the code example.
        asyncio.run(run_example(config_filepath=args[0]))
import asyncio
import sys
from typing import Any, Dict

from autogen_core.models import ChatCompletionClient
from autogen_ext.experimental.task_centric_memory.utils import Apprentice, Grader, PageLogger

from utils import create_oai_client, load_yaml_file


"""
This code sample connects task-centric memory to a selectable agent with no changes to that agent's code.
See the block diagram in the README for an overview of the components and their interactions.
See the config file configs/self_teaching.yaml for an overall view of the structure and settings in this sample.

Execute the sample with this command:
    python eval_self_teaching.py configs/self_teaching.yaml

We say that an agent is self-teaching if it can learn quickly from its own trial and error with no user input.
This sample asks the agent to perform a reasoning task on which it usually fails.
Then using automatic success or failure feedback (for a verifiable task with no side-effects on the environment),
the agent iterates through a background learning loop to find a solution, which it then stores as an insight in memory.
Finally the agent is tested again to see if it can retrieve and apply its insight to the original task,
as well as to a similar but different task as a test of generalization.

If adapting this sample code to a new setting, the Apprentice class can be used or completely replaced by other code.
"""


async def eval_self_teaching(
    apprentice: Apprentice, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any]
) -> str:
    """
    Evaluates the ability of an agent to learn quickly from its own trial and error.

    Runs num_loops train/test cycles: each loop trains on task 1, then tests
    both task 1 and the similar task 2 (generalization) with memory enabled.
    Returns the overall success rates as a summary string.
    """
    logger.enter_function()

    num_loops = config["num_loops"]
    num_final_test_trials = config["num_final_test_trials"]
    grader = Grader(client, logger)

    # Load the specified data.
    task_dict_1 = load_yaml_file(config["task_file_1"])
    task_description_1 = task_dict_1["task_description"]
    expected_answer_1 = task_dict_1["expected_answer"]

    # Test generalization on this different, similar task.
    task_dict_2 = load_yaml_file(config["task_file_2"])
    task_description_2 = task_dict_2["task_description"]
    expected_answer_2 = task_dict_2["expected_answer"]

    # Start the test with empty memory.
    apprentice.reset_memory()

    total_num_successes_1 = 0
    total_num_successes_2 = 0
    total_num_trials = 0
    for _ in range(num_loops):
        # Train on the first task.
        await apprentice.train_on_task(task=task_description_1, expected_answer=expected_answer_1)

        # Test on the first task.
        num_successes, num_trials = await grader.test_apprentice(
            apprentice=apprentice,
            task_description=task_description_1,
            expected_answer=expected_answer_1,
            num_trials=num_final_test_trials,
            use_memory=True,
            client=client,
        )
        logger.info("Task 1 success rate: {}%".format(round((num_successes / num_trials) * 100)))
        total_num_successes_1 += num_successes

        # Test on the second task.
        num_successes, num_trials = await grader.test_apprentice(
            apprentice=apprentice,
            task_description=task_description_2,
            expected_answer=expected_answer_2,
            num_trials=num_final_test_trials,
            use_memory=True,
            client=client,
        )
        logger.info("Task 2 success rate: {}%".format(round((num_successes / num_trials) * 100)))
        total_num_successes_2 += num_successes

        total_num_trials += num_final_test_trials
        logger.info("")

    overall_success_rate_1 = round((total_num_successes_1 / total_num_trials) * 100)
    overall_success_rate_2 = round((total_num_successes_2 / total_num_trials) * 100)

    results_str_1 = "Overall task 1 success rate: {}%".format(overall_success_rate_1)
    results_str_2 = "Overall task 2 success rate: {}%".format(overall_success_rate_2)
    logger.info("\n" + results_str_1)
    logger.info(results_str_2)

    logger.leave_function()
    return "\neval_self_teaching\n" + results_str_1 + "\n" + results_str_2


async def run_example(config_filepath: str) -> None:
    """
    Runs the code example with the necessary components.
    """
    config = load_yaml_file(config_filepath)

    # Create the necessary components.
    logger = PageLogger(config["PageLogger"])
    client = create_oai_client(config["client"])
    apprentice = Apprentice(client, config["Apprentice"], logger)

    # Call the example function.
    results = await eval_self_teaching(apprentice, client, logger, config["test"])

    # Finish up.
    print(results)


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 1:
        # Print usage information.
        print("Usage:  python eval_self_teaching.py  <path to *.yaml file>")
    else:
        # Run the code example.
        asyncio.run(run_example(config_filepath=args[0]))
import asyncio
import sys
from typing import Any, Dict

from autogen_core.models import ChatCompletionClient
from autogen_ext.experimental.task_centric_memory.utils import Apprentice, Grader, PageLogger

from utils import create_oai_client, load_yaml_file


"""
This code sample connects task-centric memory to a selectable agent with no changes to that agent's code.
See the block diagram in the README for an overview of the components and their interactions.
See the config file configs/teachability.yaml for an overall view of the structure and settings in this sample.

Execute the sample with this command:
    python eval_teachability.py configs/teachability.yaml

Teachable agents use memory to learn quickly from user teachings, hints, and advice.
The function below passes user instructions (loaded from a file) to the agent by calling Apprentice.handle_user_message().
If adapting this sample code to a new setting, the Apprentice class can be used or completely replaced by other code.

1. In the first conversation, the agent is expected to fail because it lacks the necessary knowledge.
2. In the second conversation (starting with an empty context window), the user provides the missing insight.
3. In the third conversation, the agent is expected to succeed after retrieving the key insight from memory.
"""


async def eval_teachability(
    apprentice: Apprentice, client: ChatCompletionClient, logger: PageLogger, config: Dict[str, Any]
) -> str:
    """
    Evaluates the ability to learn quickly from user teachings, hints, and advice.

    Grades the agent's answer before teaching, stores the user-provided
    insight, then grades the answer again; returns both results as a string.
    """
    logger.enter_function()

    # Load the specified data.
    task_dict = load_yaml_file(config["task_file"])
    task_description = task_dict["task_description"]
    expected_answer = task_dict["expected_answer"]

    insight_dict = load_yaml_file(config["insight_file"])
    insight = insight_dict["insight"]

    # First test without memory.
    apprentice.reset_memory()
    logger.info("\nClear memory, then ask the question.")
    response = await apprentice.handle_user_message(task_description)

    # Check the response.
    grader = Grader(client, logger)
    response_is_correct, extracted_answer = await grader.is_response_correct(
        task_description, response, expected_answer
    )
    logger.info("Extracted answer:  {}".format(extracted_answer))
    if response_is_correct:
        results_str_1 = "Answer before teaching is CORRECT."
    else:
        results_str_1 = "Answer before teaching is INCORRECT."
    logger.info(results_str_1 + "\n")

    # Give advice that should help solve this task.
    logger.info("Give the advice.")
    await apprentice.handle_user_message(insight)

    # Now ask the question again to see if the advice helps.
    logger.info("\nAsk the question again to see if the advice helps.")
    response = await apprentice.handle_user_message(task_description)

    # Check the response.
    response_is_correct, extracted_answer = await grader.is_response_correct(
        task_description, response, expected_answer
    )
    logger.info("Extracted answer:  {}".format(extracted_answer))
    if response_is_correct:
        results_str_2 = "Answer after teaching is CORRECT."
    else:
        results_str_2 = "Answer after teaching is INCORRECT."
    logger.info(results_str_2 + "\n")

    logger.leave_function()
    return "\neval_teachability\n" + results_str_1 + "\n" + results_str_2


async def run_example(config_filepath: str) -> None:
    """
    Runs the code example with the necessary components.
    """
    config = load_yaml_file(config_filepath)

    # Create the necessary components.
    logger = PageLogger(config["PageLogger"])
    client = create_oai_client(config["client"])
    apprentice = Apprentice(client, config["Apprentice"], logger)

    # Call the example function.
    results = await eval_teachability(apprentice, client, logger, config["test"])

    # Finish up.
    print(results)


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 1:
        # Print usage information.
        print("Usage:  python eval_teachability.py  <path to *.yaml file>")
    else:
        # Run the code example.
        asyncio.run(run_example(config_filepath=args[0]))
from typing import Any, Dict

import yaml

from autogen_core.models import ChatCompletionClient
from autogen_ext.models.openai import OpenAIChatCompletionClient


def create_oai_client(config: Dict[str, Any]) -> ChatCompletionClient:
    """
    Creates a chat completion client from OpenAI, configured by the given dict.

    Expected keys: model, max_completion_tokens, max_retries, temperature,
    presence_penalty, frequency_penalty, top_p (see the sample configs).
    """
    client = OpenAIChatCompletionClient(
        model=config["model"],
        max_tokens=config["max_completion_tokens"],
        max_retries=config["max_retries"],
        temperature=config["temperature"],
        presence_penalty=config["presence_penalty"],
        frequency_penalty=config["frequency_penalty"],
        top_p=config["top_p"],
    )
    return client


def load_yaml_file(file_path: str) -> Any:
    """
    Opens a YAML file and returns its parsed contents.
    """
    # safe_load only constructs plain Python objects (dicts, lists, scalars),
    # avoiding the arbitrary-object construction allowed by broader loaders.
    with open(file_path, "r") as file:
        return yaml.safe_load(file)