228 changes: 228 additions & 0 deletions experiments/run_lightmen_qwen_server.py
@@ -0,0 +1,228 @@
from openai import OpenAI
import json
from tqdm import tqdm
import datetime
import time
import os
from lightmem.memory.lightmem import LightMemory

# ============ API Configuration ============
API_KEY = "empty"
API_BASE_URL = "your_qwen_server_url"
LLM_MODEL = "Qwen/Qwen-chat"
JUDGE_MODEL = "Qwen/Qwen-chat"

# ============ Model Paths ============
LLMLINGUA_MODEL_PATH = "/your/path/to/models/llmlingua-2-bert-base-multilingual-cased-meetingbank"
EMBEDDING_MODEL_PATH = "/your/path/to/models/all-MiniLM-L6-v2"
EMBEDDING_DIMS = 384
# ============ Data Configuration ============
DATA_PATH = "/your/path/to/LightMem/experiments/sample_data.json"
RESULTS_DIR = "./results"
QDRANT_DATA_DIR = "./qdrant_data"


def get_anscheck_prompt(task, question, answer, response, abstention=False):
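    """Build the judge prompt that checks a model response against the gold answer for a given task type."""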
if not abstention:
if task in ["single-session-user", "single-session-assistant", "multi-session"]:
template = "I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. \n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
elif task == "temporal-reasoning":
template = "I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. In addition, do not penalize off-by-one errors for the number of days. If the question asks for the number of days/weeks/months, etc., and the model makes off-by-one errors (e.g., predicting 19 days when the answer is 18), the model's response is still correct. \n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
elif task == "knowledge-update":
template = "I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response contains some previous information along with an updated answer, the response should be considered as correct as long as the updated answer is the required answer.\n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
elif task == "single-session-preference":
template = "I will give you a question, a rubric for desired personalized response, and a response from a model. Please answer yes if the response satisfies the desired response. Otherwise, answer no. The model does not need to reflect all the points in the rubric. The response is correct as long as it recalls and utilizes the user's personal information correctly.\n\nQuestion: {}\n\nRubric: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
else:
raise NotImplementedError
else:
template = "I will give you an unanswerable question, an explanation, and a response from a model. Please answer yes if the model correctly identifies the question as unanswerable. The model could say that the information is incomplete, or some other information is given but the asked information is not.\n\nQuestion: {}\n\nExplanation: {}\n\nModel Response: {}\n\nDoes the model correctly identify the question as unanswerable? Answer yes or no only."
prompt = template.format(question, answer, response)
return prompt


def true_or_false(response):
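    """Map the judge's free-form reply to a boolean: True only if its first line indicates "yes"."""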
if response is None:
return False
normalized = str(response).strip().lower()
if not normalized:
return False
first_line = normalized.splitlines()[0].strip()
tokens = first_line.replace(".", "").replace("!", "").replace(":", "").replace(";", "").split()
if not tokens:
return False
head = tokens[0]
if head in ("yes", "y"):
return True
if head in ("no", "n"):
return False
if "yes" in first_line:
return True
if "no" in first_line:
return False
return False


class LLMModel:
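    """Thin wrapper around an OpenAI-compatible chat completions endpoint with basic retry support."""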
def __init__(self, model_name, api_key, base_url):
self.name = model_name
self.api_key = api_key
self.base_url = base_url
self.max_tokens = 2000
self.temperature = 0.0
self.top_p = 0.8
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

def call(self, messages: list, **kwargs):
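        """Send the chat messages to the server, retrying up to max_retries times on failure."""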
max_retries = kwargs.get("max_retries", 3)

for attempt in range(max_retries):
try:
completion = self.client.chat.completions.create(
model=self.name,
messages=messages,
max_tokens=self.max_tokens,
temperature=self.temperature,
top_p=self.top_p,
stream=False,
)
response = completion.choices[0].message.content
print(response)
return response
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                # Log the failure and back off briefly before retrying
                print(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
                time.sleep(2 ** attempt)


def load_lightmem(collection_name):
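    """Build a LightMemory instance with LLMLingua-2 compression, HuggingFace embeddings, and a per-question Qdrant collection."""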
config = {
"pre_compress": True,
"pre_compressor": {
"model_name": "llmlingua-2",
"configs": {
"llmlingua_config": {
"model_name": LLMLINGUA_MODEL_PATH,
"device_map": "cpu",
"use_llmlingua2": True,
},
},
},
"topic_segment": True,
"precomp_topic_shared": True,
"topic_segmenter": {
"model_name": "llmlingua-2",
},
"messages_use": "user_only",
"metadata_generate": True,
"text_summary": True,
"memory_manager": {
"model_name": "openai",
"configs": {
"model": LLM_MODEL,
"api_key": API_KEY,
"max_tokens": 16000,
"openai_base_url": API_BASE_URL,
},
},
"extract_threshold": 0.1,
"index_strategy": "embedding",
"text_embedder": {
"model_name": "huggingface",
"configs": {
"model": EMBEDDING_MODEL_PATH,
"embedding_dims": EMBEDDING_DIMS,
"model_kwargs": {"device": "cpu"},
},
},
"retrieve_strategy": "embedding",
"embedding_retriever": {
"model_name": "qdrant",
"configs": {
"collection_name": collection_name,
"embedding_model_dims": EMBEDDING_DIMS,
"path": f"{QDRANT_DATA_DIR}/{collection_name}",
},
},
"update": "offline",
}
lightmem = LightMemory.from_config(config)
return lightmem


llm_judge = LLMModel(JUDGE_MODEL, API_KEY, API_BASE_URL)
llm = LLMModel(LLM_MODEL, API_KEY, API_BASE_URL)

with open(DATA_PATH, "r") as f:
    data = json.load(f)
data = data[:10]

INIT_RESULT = {"add_input_prompt": [], "add_output_prompt": [], "api_call_nums": 0}

for item in tqdm(data):
print(item["question_id"])
lightmem = load_lightmem(collection_name=item["question_id"])
sessions = item["haystack_sessions"]
timestamps = item["haystack_dates"]

results_list = []

time_start = time.time()
for session, timestamp in zip(sessions, timestamps):
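        # Drop leading non-user messages so each turn pairs a user message with an assistant reply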
while session and session[0]["role"] != "user":
session.pop(0)
num_turns = len(session) // 2
for turn_idx in range(num_turns):
turn_messages = session[turn_idx * 2 : turn_idx * 2 + 2]
if len(turn_messages) < 2 or turn_messages[0]["role"] != "user" or turn_messages[1]["role"] != "assistant":
continue
for msg in turn_messages:
msg["time_stamp"] = timestamp
is_last_turn = session is sessions[-1] and turn_idx == num_turns - 1
result = lightmem.add_memory(
messages=turn_messages,
force_segment=is_last_turn,
force_extract=is_last_turn,
)
if result != INIT_RESULT:
results_list.append(result)

time_end = time.time()
construction_time = time_end - time_start

related_memories = lightmem.retrieve(item["question"], limit=20)
messages = []
messages.append({"role": "system", "content": "You are a helpful assistant."})
messages.append(
{
"role": "user",
"content": f"Question time:{item['question_date']} and question:{item['question']}\nPlease answer the question based on the following memories: {str(related_memories)}",
}
)
generated_answer = llm.call(messages)

if "abs" in item["question_id"]:
prompt = get_anscheck_prompt(
item["question_type"], item["question"], item["answer"], generated_answer, abstention=True
)
else:
prompt = get_anscheck_prompt(item["question_type"], item["question"], item["answer"], generated_answer)
messages = [{"role": "user", "content": prompt}]
response = llm_judge.call(messages)

correct = 1 if true_or_false(response) else 0

save_data = {
"question_id": item["question_id"],
"results": results_list,
"construction_time": construction_time,
"generated_answer": generated_answer,
"ground_truth": item["answer"],
"correct": correct,
}

filename = f"../results/result_{item['question_id']}.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w", encoding="utf-8") as f:
json.dump(save_data, f, ensure_ascii=False, indent=4)
137 changes: 137 additions & 0 deletions experiments/run_lightmen_qwen_server_README.md
@@ -0,0 +1,137 @@
# Evaluating Long-Term Memory for LLMs with LightMem

This project shows how to give Large Language Models (LLMs) long-term memory using the **LightMem** framework. By pairing a locally hosted Qwen model, served through an OpenAI-compatible API, with LightMem's memory pipeline, the experiment works around the limits of finite context windows, enabling conversations that stay context-aware, persistent, and personalized across multiple sessions.

The included script runs a series of tests to evaluate the system's performance on three critical long-term memory tasks:
1. **Multi-Session Context Consolidation**
2. **Dynamic Knowledge Updates**
3. **Handling of Unanswerable Questions**

## 🏛️ Architecture

The system follows a Retrieval-Augmented Generation (RAG) architecture orchestrated by LightMem; a condensed sketch of the flow follows this component list:

- **Memory Framework:** **LightMem** manages the entire lifecycle of memories: ingestion, compression, storage, retrieval, and updating.
- **Reasoning Engine (LLM):** **Qwen/Qwen3-Next-80B-A3B-Instruct** (or any other compatible model) served locally, responsible for understanding user queries and generating responses based on retrieved memories.
- **Memory Compressor:** **`llmlingua-2`** reduces the token footprint of conversational history, making storage and retrieval more efficient.
- **Embedding Model:** **`all-MiniLM-L6-v2`** transforms textual memories into dense vector representations for semantic search.
- **Vector Store:** **Qdrant** provides a high-performance local database for storing and querying memory vectors.
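
The components above fit together in an add → retrieve → answer loop. The sketch below condenses the flow from `run_lightmen_qwen_server.py`; the `config` placeholder, the sample messages, the timestamp, and the endpoint/model names are stand-ins rather than working values:

```python
from openai import OpenAI
from lightmem.memory.lightmem import LightMemory

config = {...}  # the configuration dict built in load_lightmem() in run_lightmen_qwen_server.py
lightmem = LightMemory.from_config(config)

# 1) Ingest one conversation turn; LightMem compresses, segments, and stores it in Qdrant
lightmem.add_memory(
    messages=[
        {"role": "user", "content": "Let's plan a trip.", "time_stamp": "2025-01-01"},  # illustrative timestamp
        {"role": "assistant", "content": "Sure, where to?", "time_stamp": "2025-01-01"},
    ],
    force_segment=False,  # set True on the final turn to flush buffered messages
    force_extract=False,
)

# 2) Retrieve the memories most relevant to a question
memories = lightmem.retrieve("Which city did I pick for my trip?", limit=20)

# 3) Ask the locally served Qwen model to answer using the retrieved memories
client = OpenAI(api_key="empty", base_url="your_qwen_server_url")
answer = client.chat.completions.create(
    model="Qwen/Qwen-chat",
    messages=[{"role": "user", "content": f"Answer the question based on these memories: {memories}"}],
).choices[0].message.content
```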

## ✨ Key Features Demonstrated

- **Cross-Session Information Recall:** The system can synthesize information provided in separate conversations to answer a complex query.
- **Stateful Knowledge Management:** It correctly identifies and uses the most recent information when user preferences or facts change over time.
- **Knowledge Boundary Awareness:** The model can recognize when it lacks the necessary information to answer a question and gracefully abstains from responding, preventing factual hallucination.

## 🚀 Getting Started

Follow these steps to set up and run the experiment on your own machine.

### 1. Prerequisites

- Python 3.8+
- An OpenAI-compatible API server running a model like Qwen. You can set this up using tools like [vLLM](https://github.com/vllm-project/vllm) or [FastChat](https://github.com/lm-sys/FastChat); a quick connectivity check is shown after this list.
- The required Hugging Face models downloaded locally.
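
Once the server is running, the script talks to it through the standard OpenAI Python client. A quick sanity check, assuming a local endpoint such as `http://localhost:8000/v1` (replace the URL and model name with your own):

```python
from openai import OpenAI

# Local OpenAI-compatible servers typically accept any api_key value
client = OpenAI(api_key="empty", base_url="http://localhost:8000/v1")
reply = client.chat.completions.create(
    model="Qwen/Qwen-chat",
    messages=[{"role": "user", "content": "Reply with 'ready' if you can hear me."}],
    max_tokens=8,
)
print(reply.choices[0].message.content)
```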

### 2. Installation

1. **Clone the repository:**
```bash
git clone https://github.com/zjunlp/LightMem.git
cd LightMem
```

2. **Create a Python virtual environment (recommended):**
```bash
# Using conda
conda create -n lightmem python=3.10
conda activate lightmem

# Or using venv
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```

3. **Install dependencies:**
```bash
pip install -r requirements.txt
```
*(If you don't have a `requirements.txt` file, create one with the following content)*:

```
lightmem
openai
tqdm
```

### 3. Configuration

Before running the script, you **must** update the configuration variables at the top of `experiments/run_lightmen_qwen_server.py`:

```python
# ============ API Configuration ============
# ⚠️ UPDATE THIS to your LLM server's endpoint
API_BASE_URL = "your_qwen_server_url"

# ============ Model Paths ============
# ⚠️ UPDATE THESE paths to where you have downloaded the models
LLMLINGUA_MODEL_PATH = "/your/path/to/models/llmlingua-2-bert-base-multilingual-cased-meetingbank"
EMBEDDING_MODEL_PATH = "/your/path/to/models/all-MiniLM-L6-v2"

# ============ Data Configuration ============
# This points to the sample data file
DATA_PATH = "/your/path/to/LightMem/experiments/sample_data.json"
```

### 4. Running the Experiment

Navigate to the directory containing the script and execute it:

```bash
# It is recommended to run from the root directory of the project
python experiments/run_lightmen_qwen_server.py
```

## 📊 Expected Output

As the script runs, you will see the following:

1. **Console Output:** A `tqdm` progress bar will show the progress through the three test cases. The generated answers from the LLM and the "yes/no" judgment will be printed to the console for each case.

```
0%| | 0/3 [00:00<?, ?it/s]
multi_session_trip_planning_01
...
Based on the memories provided: ... You decided on Tokyo ... in October.
yes
33%|█████████▋ | 1/3 [00:16<00:32, 16.45s/it]
...
100%|████████████████████████████████████████████| 3/3 [00:23<00:00, 7.94s/it]
```

2. **Generated Files:** Two new directories will be created in your project root:

- `./qdrant_data/`: Contains the local Qdrant database files where the vectorized memories are stored for each test case.
   - `./results/`: Contains detailed JSON output files for each test case (e.g., `result_multi_session_trip_planning_01.json`), logging the prompts, generated answers, ground truth, and correctness score. An illustrative example is shown below.
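
Each result file mirrors the `save_data` dictionary built in the script; the values below are illustrative, not actual outputs:

```python
# Keys written per test case; all values here are made up for illustration
{
    "question_id": "multi_session_trip_planning_01",
    "results": ["..."],                 # non-empty return values from add_memory
    "construction_time": 16.4,          # seconds spent ingesting the haystack sessions
    "generated_answer": "You decided on Tokyo for a trip in October.",
    "ground_truth": "Tokyo, in October",
    "correct": 1,                       # 1 if the judge answered "yes", else 0
}
```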

## 🔬 Understanding the Test Cases

The evaluation is performed using the `sample_data.json` file, which contains three distinct scenarios.

### Test Case 1: `multi_session_trip_planning_01`

- **Scenario:** A user first expresses interest in a trip to a big city, and in a later session, decides on Tokyo and the month of October.
- **Objective:** To test if the system can retrieve and combine facts from different conversational sessions.
- **Outcome:** The system correctly answers that the user decided on **Tokyo** for a trip in **October**, demonstrating successful context consolidation.

### Test Case 2: `knowledge_update_favorite_color_02`

- **Scenario:** A user first states their favorite color is "blue," but in a later session, updates it to "green."
- **Objective:** To test the system's ability to handle conflicting information and prioritize the most recent update.
- **Outcome:** The system correctly identifies the user's current favorite color as **green**, proving its capability to manage dynamic, evolving knowledge.

### Test Case 3: `multi_session_trip_planning_abs_01`

- **Scenario:** Following the trip planning conversations, the user asks for the name of the hotel they booked. This information was never mentioned.
- **Objective:** To test if the system can recognize the absence of information in its memory and avoid making things up (hallucinating).
- **Outcome:** The system correctly states that **no information about a hotel booking is available**, demonstrating robust knowledge boundary detection.