228 changes: 228 additions & 0 deletions experiments/run_lightmen_qwen_server.py
@@ -0,0 +1,228 @@
from openai import OpenAI
import json
from tqdm import tqdm
import datetime
import time
import os
from lightmem.memory.lightmem import LightMemory

# ============ API Configuration ============
API_KEY = "empty"
API_BASE_URL = "your_qwen_server_url"
LLM_MODEL = "Qwen/Qwen-chat"
JUDGE_MODEL = "Qwen/Qwen-chat"

# ============ Model Paths ============
LLMLINGUA_MODEL_PATH = "/your/path/to/models/llmlingua-2-bert-base-multilingual-cased-meetingbank"
EMBEDDING_MODEL_PATH = "/your/path/to/models/all-MiniLM-L6-v2"
EMBEDDING_DIMS = 384
# ============ Data Configuration ============
DATA_PATH = "/your/path/to/LightMem/experiments/sample_data.json"
RESULTS_DIR = "./results"
QDRANT_DATA_DIR = "./qdrant_data"


def get_anscheck_prompt(task, question, answer, response, abstention=False):
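    """Build the judge prompt that checks a model response against the gold answer for a given task type."""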
if not abstention:
if task in ["single-session-user", "single-session-assistant", "multi-session"]:
template = "I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. \n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
elif task == "temporal-reasoning":
template = "I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. In addition, do not penalize off-by-one errors for the number of days. If the question asks for the number of days/weeks/months, etc., and the model makes off-by-one errors (e.g., predicting 19 days when the answer is 18), the model's response is still correct. \n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
elif task == "knowledge-update":
template = "I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response contains some previous information along with an updated answer, the response should be considered as correct as long as the updated answer is the required answer.\n\nQuestion: {}\n\nCorrect Answer: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
elif task == "single-session-preference":
template = "I will give you a question, a rubric for desired personalized response, and a response from a model. Please answer yes if the response satisfies the desired response. Otherwise, answer no. The model does not need to reflect all the points in the rubric. The response is correct as long as it recalls and utilizes the user's personal information correctly.\n\nQuestion: {}\n\nRubric: {}\n\nModel Response: {}\n\nIs the model response correct? Answer yes or no only."
prompt = template.format(question, answer, response)
else:
raise NotImplementedError
else:
template = "I will give you an unanswerable question, an explanation, and a response from a model. Please answer yes if the model correctly identifies the question as unanswerable. The model could say that the information is incomplete, or some other information is given but the asked information is not.\n\nQuestion: {}\n\nExplanation: {}\n\nModel Response: {}\n\nDoes the model correctly identify the question as unanswerable? Answer yes or no only."
prompt = template.format(question, answer, response)
return prompt


def true_or_false(response):
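    """Map the judge's free-form reply to a boolean: True only if its first line indicates "yes"."""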
if response is None:
return False
normalized = str(response).strip().lower()
if not normalized:
return False
first_line = normalized.splitlines()[0].strip()
tokens = first_line.replace(".", "").replace("!", "").replace(":", "").replace(";", "").split()
if not tokens:
return False
head = tokens[0]
if head in ("yes", "y"):
return True
if head in ("no", "n"):
return False
if "yes" in first_line:
return True
if "no" in first_line:
return False
return False


class LLMModel:
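    """Thin wrapper around an OpenAI-compatible chat completions endpoint with basic retry support."""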
def __init__(self, model_name, api_key, base_url):
self.name = model_name
self.api_key = api_key
self.base_url = base_url
self.max_tokens = 2000
self.temperature = 0.0
self.top_p = 0.8
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

def call(self, messages: list, **kwargs):
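        """Send the chat messages to the server, retrying up to max_retries times on failure."""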
max_retries = kwargs.get("max_retries", 3)

for attempt in range(max_retries):
try:
completion = self.client.chat.completions.create(
model=self.name,
messages=messages,
max_tokens=self.max_tokens,
temperature=self.temperature,
top_p=self.top_p,
stream=False,
)
response = completion.choices[0].message.content
print(response)
return response
            except Exception as e:
                if attempt == max_retries - 1:
                    raise
                # Log the failure and back off briefly before retrying
                print(f"Request failed (attempt {attempt + 1}/{max_retries}): {e}")
                time.sleep(2 ** attempt)


def load_lightmem(collection_name):
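    """Build a LightMemory instance with LLMLingua-2 compression, HuggingFace embeddings, and a per-question Qdrant collection."""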
config = {
"pre_compress": True,
"pre_compressor": {
"model_name": "llmlingua-2",
"configs": {
"llmlingua_config": {
"model_name": LLMLINGUA_MODEL_PATH,
"device_map": "cpu",
"use_llmlingua2": True,
},
},
},
"topic_segment": True,
"precomp_topic_shared": True,
"topic_segmenter": {
"model_name": "llmlingua-2",
},
"messages_use": "user_only",
"metadata_generate": True,
"text_summary": True,
"memory_manager": {
"model_name": "openai",
"configs": {
"model": LLM_MODEL,
"api_key": API_KEY,
"max_tokens": 16000,
"openai_base_url": API_BASE_URL,
},
},
"extract_threshold": 0.1,
"index_strategy": "embedding",
"text_embedder": {
"model_name": "huggingface",
"configs": {
"model": EMBEDDING_MODEL_PATH,
"embedding_dims": EMBEDDING_DIMS,
"model_kwargs": {"device": "cpu"},
},
},
"retrieve_strategy": "embedding",
"embedding_retriever": {
"model_name": "qdrant",
"configs": {
"collection_name": collection_name,
"embedding_model_dims": EMBEDDING_DIMS,
"path": f"{QDRANT_DATA_DIR}/{collection_name}",
},
},
"update": "offline",
}
lightmem = LightMemory.from_config(config)
return lightmem


llm_judge = LLMModel(JUDGE_MODEL, API_KEY, API_BASE_URL)
llm = LLMModel(LLM_MODEL, API_KEY, API_BASE_URL)

with open(DATA_PATH, "r") as f:
    data = json.load(f)
data = data[:10]

INIT_RESULT = {"add_input_prompt": [], "add_output_prompt": [], "api_call_nums": 0}

for item in tqdm(data):
print(item["question_id"])
lightmem = load_lightmem(collection_name=item["question_id"])
sessions = item["haystack_sessions"]
timestamps = item["haystack_dates"]

results_list = []

time_start = time.time()
for session, timestamp in zip(sessions, timestamps):
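        # Drop leading non-user messages so each turn pairs a user message with an assistant reply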
while session and session[0]["role"] != "user":
session.pop(0)
num_turns = len(session) // 2
for turn_idx in range(num_turns):
turn_messages = session[turn_idx * 2 : turn_idx * 2 + 2]
if len(turn_messages) < 2 or turn_messages[0]["role"] != "user" or turn_messages[1]["role"] != "assistant":
continue
for msg in turn_messages:
msg["time_stamp"] = timestamp
is_last_turn = session is sessions[-1] and turn_idx == num_turns - 1
result = lightmem.add_memory(
messages=turn_messages,
force_segment=is_last_turn,
force_extract=is_last_turn,
)
if result != INIT_RESULT:
results_list.append(result)

time_end = time.time()
construction_time = time_end - time_start

related_memories = lightmem.retrieve(item["question"], limit=20)
messages = []
messages.append({"role": "system", "content": "You are a helpful assistant."})
messages.append(
{
"role": "user",
"content": f"Question time:{item['question_date']} and question:{item['question']}\nPlease answer the question based on the following memories: {str(related_memories)}",
}
)
generated_answer = llm.call(messages)

if "abs" in item["question_id"]:
prompt = get_anscheck_prompt(
item["question_type"], item["question"], item["answer"], generated_answer, abstention=True
)
else:
prompt = get_anscheck_prompt(item["question_type"], item["question"], item["answer"], generated_answer)
messages = [{"role": "user", "content": prompt}]
response = llm_judge.call(messages)

correct = 1 if true_or_false(response) else 0

save_data = {
"question_id": item["question_id"],
"results": results_list,
"construction_time": construction_time,
"generated_answer": generated_answer,
"ground_truth": item["answer"],
"correct": correct,
}

filename = f"../results/result_{item['question_id']}.json"
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w", encoding="utf-8") as f:
json.dump(save_data, f, ensure_ascii=False, indent=4)
137 changes: 137 additions & 0 deletions experiments/run_lightmen_qwen_server_README.md
@@ -0,0 +1,137 @@
# Evaluating Long-Term Memory for LLMs with LightMem

This project shows how to give Large Language Models (LLMs) long-term memory using the **LightMem** framework. By pairing a locally hosted Qwen model, served through an OpenAI-compatible API, with LightMem's memory pipeline, the experiment works around the limits of finite context windows, enabling conversations that stay context-aware, persistent, and personalized across multiple sessions.

The included script runs a series of tests to evaluate the system's performance on three critical long-term memory tasks:
1. **Multi-Session Context Consolidation**
2. **Dynamic Knowledge Updates**
3. **Handling of Unanswerable Questions**

## 🏛️ Architecture

The system follows a Retrieval-Augmented Generation (RAG) architecture orchestrated by LightMem; a condensed sketch of the flow follows this component list:

- **Memory Framework:** **LightMem** manages the entire lifecycle of memories: ingestion, compression, storage, retrieval, and updating.
- **Reasoning Engine (LLM):** **Qwen/Qwen3-Next-80B-A3B-Instruct** (or any other compatible model) served locally, responsible for understanding user queries and generating responses based on retrieved memories.
- **Memory Compressor:** **`llmlingua-2`** reduces the token footprint of conversational history, making storage and retrieval more efficient.
- **Embedding Model:** **`all-MiniLM-L6-v2`** transforms textual memories into dense vector representations for semantic search.
- **Vector Store:** **Qdrant** provides a high-performance local database for storing and querying memory vectors.
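
The components above fit together in an add → retrieve → answer loop. The sketch below condenses the flow from `run_lightmen_qwen_server.py`; the `config` placeholder, the sample messages, the timestamp, and the endpoint/model names are stand-ins rather than working values:

```python
from openai import OpenAI
from lightmem.memory.lightmem import LightMemory

config = {...}  # the configuration dict built in load_lightmem() in run_lightmen_qwen_server.py
lightmem = LightMemory.from_config(config)

# 1) Ingest one conversation turn; LightMem compresses, segments, and stores it in Qdrant
lightmem.add_memory(
    messages=[
        {"role": "user", "content": "Let's plan a trip.", "time_stamp": "2025-01-01"},  # illustrative timestamp
        {"role": "assistant", "content": "Sure, where to?", "time_stamp": "2025-01-01"},
    ],
    force_segment=False,  # set True on the final turn to flush buffered messages
    force_extract=False,
)

# 2) Retrieve the memories most relevant to a question
memories = lightmem.retrieve("Which city did I pick for my trip?", limit=20)

# 3) Ask the locally served Qwen model to answer using the retrieved memories
client = OpenAI(api_key="empty", base_url="your_qwen_server_url")
answer = client.chat.completions.create(
    model="Qwen/Qwen-chat",
    messages=[{"role": "user", "content": f"Answer the question based on these memories: {memories}"}],
).choices[0].message.content
```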

## ✨ Key Features Demonstrated

- **Cross-Session Information Recall:** The system can synthesize information provided in separate conversations to answer a complex query.
- **Stateful Knowledge Management:** It correctly identifies and uses the most recent information when user preferences or facts change over time.
- **Knowledge Boundary Awareness:** The model can recognize when it lacks the necessary information to answer a question and gracefully abstains from responding, preventing factual hallucination.

## 🚀 Getting Started

Follow these steps to set up and run the experiment on your own machine.

### 1. Prerequisites

- Python 3.8+
- An OpenAI-compatible API server running a model like Qwen. You can set this up using tools like [vLLM](https://github.com/vllm-project/vllm) or [FastChat](https://github.com/lm-sys/FastChat); a quick connectivity check is shown after this list.
- The required Hugging Face models downloaded locally.
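
Once the server is running, the script talks to it through the standard OpenAI Python client. A quick sanity check, assuming a local endpoint such as `http://localhost:8000/v1` (replace the URL and model name with your own):

```python
from openai import OpenAI

# Local OpenAI-compatible servers typically accept any api_key value
client = OpenAI(api_key="empty", base_url="http://localhost:8000/v1")
reply = client.chat.completions.create(
    model="Qwen/Qwen-chat",
    messages=[{"role": "user", "content": "Reply with 'ready' if you can hear me."}],
    max_tokens=8,
)
print(reply.choices[0].message.content)
```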

### 2. Installation

1. **Clone the repository:**
```bash
git clone https://github.com/zjunlp/LightMem.git
cd LightMem
```

2. **Create a Python virtual environment (recommended):**
```bash
# Using conda
conda create -n lightmem python=3.10
conda activate lightmem

# Or using venv
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
```

3. **Install dependencies:**
```bash
pip install -r requirements.txt
```
*(If you don't have a `requirements.txt` file, create one with the following content)*:

```
lightmem
openai
tqdm
```

### 3. Configuration

Before running the script, you **must** update the configuration variables at the top of `experiments/run_lightmen_qwen_server.py`:

```python
# ============ API Configuration ============
# ⚠️ UPDATE THIS to your LLM server's endpoint
API_BASE_URL = "your_qwen_server_url"

# ============ Model Paths ============
# ⚠️ UPDATE THESE paths to where you have downloaded the models
LLMLINGUA_MODEL_PATH = "/your/path/to/models/llmlingua-2-bert-base-multilingual-cased-meetingbank"
EMBEDDING_MODEL_PATH = "/your/path/to/models/all-MiniLM-L6-v2"

# ============ Data Configuration ============
# This points to the sample data file
DATA_PATH = "/your/path/to/LightMem/experiments/sample_data.json"
```

### 4. Running the Experiment

Navigate to the directory containing the script and execute it:

```bash
# It is recommended to run from the root directory of the project
python experiments/run_lightmen_qwen_server.py
```

## 📊 Expected Output

As the script runs, you will see the following:

1. **Console Output:** A `tqdm` progress bar will show the progress through the three test cases. The generated answers from the LLM and the "yes/no" judgment will be printed to the console for each case.

```
0%| | 0/3 [00:00<?, ?it/s]
multi_session_trip_planning_01
...
Based on the memories provided: ... You decided on Tokyo ... in October.
yes
33%|█████████▋ | 1/3 [00:16<00:32, 16.45s/it]
...
100%|████████████████████████████████████████████| 3/3 [00:23<00:00, 7.94s/it]
```

2. **Generated Files:** Two new directories will be created in your project root:

- `./qdrant_data/`: Contains the local Qdrant database files where the vectorized memories are stored for each test case.
   - `./results/`: Contains detailed JSON output files for each test case (e.g., `result_multi_session_trip_planning_01.json`), logging the prompts, generated answers, ground truth, and correctness score. An illustrative example is shown below.
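
Each result file mirrors the `save_data` dictionary built in the script; the values below are illustrative, not actual outputs:

```python
# Keys written per test case; all values here are made up for illustration
{
    "question_id": "multi_session_trip_planning_01",
    "results": ["..."],                 # non-empty return values from add_memory
    "construction_time": 16.4,          # seconds spent ingesting the haystack sessions
    "generated_answer": "You decided on Tokyo for a trip in October.",
    "ground_truth": "Tokyo, in October",
    "correct": 1,                       # 1 if the judge answered "yes", else 0
}
```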

## 🔬 Understanding the Test Cases

The evaluation is performed using the `sample_data.json` file, which contains three distinct scenarios.

### Test Case 1: `multi_session_trip_planning_01`

- **Scenario:** A user first expresses interest in a trip to a big city, and in a later session, decides on Tokyo and the month of October.
- **Objective:** To test if the system can retrieve and combine facts from different conversational sessions.
- **Outcome:** The system correctly answers that the user decided on **Tokyo** for a trip in **October**, demonstrating successful context consolidation.

### Test Case 2: `knowledge_update_favorite_color_02`

- **Scenario:** A user first states their favorite color is "blue," but in a later session, updates it to "green."
- **Objective:** To test the system's ability to handle conflicting information and prioritize the most recent update.
- **Outcome:** The system correctly identifies the user's current favorite color as **green**, proving its capability to manage dynamic, evolving knowledge.

### Test Case 3: `multi_session_trip_planning_abs_01`

- **Scenario:** Following the trip planning conversations, the user asks for the name of the hotel they booked. This information was never mentioned.
- **Objective:** To test if the system can recognize the absence of information in its memory and avoid making things up (hallucinating).
- **Outcome:** The system correctly states that **no information about a hotel booking is available**, demonstrating robust knowledge boundary detection.