Skip to content

Commit c4cb46e

Browse files
committed
feat(ragflow-knowledge): 新增知识库管理脚本并更新文档
新增 `list_datasets.py` 脚本,用于列出所有可用知识库以支持智能路由。 新增 `batch_update.py` 脚本,用于批量更新预定义知识库的描述。 新增 `update_description.py` 脚本,用于更新单个知识库的描述。 更新 SKILL.md 文档,新增“列出可用知识库”章节并优化存储流程说明。
1 parent 0f04ea4 commit c4cb46e

File tree

4 files changed

+239
-3
lines changed

4 files changed

+239
-3
lines changed

skills/ragflow-knowledge/SKILL.md

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,23 +41,41 @@ python3 skills/ragflow-knowledge/scripts/retrieve_knowledge.py --query "您的
4141
- **Query**: 要搜索的问题或主题。
4242
- **Chat ID**: (可选)使用的特定聊天 ID。
4343

44-
### 3. 存储知识(保存)
44+
### 3. 列出可用知识库
45+
46+
在决定将知识存入何处之前,先列出当前可用的知识库,以便根据内容主题选择最合适的目标。
47+
48+
```bash
49+
python3 skills/ragflow-knowledge/scripts/list_datasets.py
50+
```
51+
52+
### 4. 存储知识(保存)
4553

4654
当用户明确要求将内容(文本、代码或文件)保存到知识库时使用。
4755

56+
**智能路由策略**
57+
1. 首先调用 `list_datasets.py` 获取所有知识库列表。
58+
2. 分析用户提供的知识内容(主题、类型、敏感度)。
59+
3. 将内容匹配到最合适的知识库(例如:技术文档 -> "Technical Docs", HR政策 -> "HR Policies")。
60+
4. 调用 `save_knowledge.py` 并指定 `dataset_name` 或 `dataset_id`。
61+
5. 如果无法确定,可以询问用户或存入默认知识库(如 "General Knowledge")。
62+
4863
```bash
49-
python3 skills/ragflow-knowledge/scripts/save_knowledge.py --content "要保存的内容" [--file_path "/文件/路径"] [--dataset_name "我的知识库"]
64+
python3 skills/ragflow-knowledge/scripts/save_knowledge.py --content "要保存的内容" [--file_path "/文件/路径"] [--dataset_name "目标知识库名称"]
5065
```
5166

5267
- **Content**: 直接保存的文本内容。
5368
- **File Path**: 要上传的本地文件路径。
54-
- **Dataset Name**: (可选)保存到的数据集名称。默认为 "TraeKnowledge"
69+
- **Dataset Name**: (可选)保存到的数据集名称。如果不指定,脚本将尝试使用默认值或创建新库
5570

5671
## 脚本详情
5772

5873
### `check_config.py`
5974
验证与 RAGFlow 的连接并列出可用的数据集/聊天。这有助于确保环境准备就绪。
6075

76+
### `list_datasets.py`
77+
返回所有可用知识库的详细列表(JSON格式),包含ID、名称、描述和文档数量。Agent 应使用此输出来决定存储目标。
78+
6179
### `retrieve_knowledge.py`
6280
向 RAGFlow Chat API 发送查询并返回答案/上下文。
6381

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import os
import requests
import sys
import json
from utils import load_config, get_base_url

# Request timeout in seconds, consistent with list_datasets.py so no
# script in this skill can hang indefinitely on an unreachable server.
REQUEST_TIMEOUT = 10

# Predefined dataset descriptions, keyed by exact dataset name.
# Names not found on the server are skipped with a warning.
UPDATES = {
    "TML-技术文档": "存放团队核心技术沉淀,包括领域驱动设计(DDD)、架构设计模式、技术规范、技术方案及论文。",
    "TML-团队规范": "团队协作标准与管理流程,包括代码编写规范、Git 提交规范、研发流程管理制度、考勤与行政规则。",
    "TML-公共资源": "团队公共资产与基础设施信息,包括服务器资产列表、脱敏后的账号管理、常用软件工具包下载地址、常见问题解答(FAQ)。",
    "TML-项目文档": "具体项目的全生命周期文档,包括需求规格说明书(PRD)、项目排期表、会议纪要、测试报告与验收文档。",
    "TML-知识库": "非结构化或未分类的通用知识,包括行业动态、竞品情报、外部采集的参考资料及碎片化信息。"
}

def batch_update():
    """Batch-update the descriptions of the datasets listed in UPDATES.

    Flow:
      1. Load RAGFlow connection settings (base URL + API key).
      2. List the existing datasets (first 100).
      3. For each name in UPDATES that exists on the server, PUT the new
         description while preserving the dataset's current permission
         and avatar.

    Exits with status 1 on missing configuration or a failed listing;
    per-dataset update failures are reported but do not abort the run.
    """
    config = load_config()
    base_url = config.get("RAGFLOW_BASE_URL")
    api_key = config.get("RAGFLOW_API_KEY")

    if not base_url or not api_key:
        print("Error: RAGFlow configuration missing.")
        sys.exit(1)

    full_url = get_base_url(base_url)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    print("Fetching existing datasets...")
    try:
        response = requests.get(f"{full_url}/datasets?page=1&page_size=100",
                                headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Error listing datasets: {response.status_code}")
            sys.exit(1)

        data = response.json()
        if data.get("code") != 0:
            print(f"Error listing datasets: {data.get('message')}")
            sys.exit(1)

        existing_datasets = {ds['name']: ds for ds in data.get("data", [])}

        for name, description in UPDATES.items():
            if name not in existing_datasets:
                print(f"⚠️ Dataset '{name}' not found. Skipping.")
                continue

            ds = existing_datasets[name]
            print(f"Updating '{name}'...")

            update_payload = {
                "name": name,
                "description": description,
                "permission": ds.get("permission", "me"),
                "avatar": ds.get("avatar", ""),
                # tenant_id deliberately omitted: the RAGFlow update
                # endpoint rejects it with "Extra inputs are not permitted".
            }

            update_res = requests.put(f"{full_url}/datasets/{ds['id']}",
                                      headers=headers, json=update_payload,
                                      timeout=REQUEST_TIMEOUT)

            if update_res.status_code != 200:
                print(f"  ❌ Failed: {update_res.status_code}")
                continue

            res_data = update_res.json()
            if res_data.get("code") == 0:
                print("  ✅ Success")
            else:
                print(f"  ❌ Failed: {res_data.get('message')}")

    except Exception as e:
        # Broad catch is acceptable at this top-level CLI boundary:
        # report the error and exit non-zero.
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    batch_update()
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
import requests
import json
import sys
from utils import load_config, get_base_url

def list_datasets():
    """Print every available RAGFlow dataset as a JSON array on stdout.

    Each entry carries id, name, description, permission and doc_count so
    that an agent can pick the most suitable storage target. Exits with
    status 1 on missing configuration, a non-200 response, a non-zero API
    code, or any request error.
    """
    config = load_config()
    base_url = config.get("RAGFLOW_BASE_URL")
    api_key = config.get("RAGFLOW_API_KEY")

    if not base_url or not api_key:
        print("Error: RAGFlow configuration missing. Please run 'python3 scripts/check_config.py' first.")
        sys.exit(1)

    full_url = get_base_url(base_url)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        resp = requests.get(f"{full_url}/datasets?page=1&page_size=100",
                            headers=headers, timeout=10)

        if resp.status_code != 200:
            print(f"Error: Failed to connect (Status Code: {resp.status_code})")
            sys.exit(1)

        payload = resp.json()
        if payload.get("code") != 0:
            print(f"Error: API returned code {payload.get('code')}: {payload.get('message')}")
            sys.exit(1)

        summary = [
            {
                "id": ds.get("id"),
                "name": ds.get("name"),
                "description": ds.get("description", ""),
                "permission": ds.get("permission", "me"),
                "doc_count": ds.get("doc_count", 0),
            }
            for ds in payload.get("data", [])
        ]

        # Machine-readable output so the calling agent can parse it directly.
        print(json.dumps(summary, ensure_ascii=False, indent=2))

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    list_datasets()
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import os
import requests
import argparse
import sys
import json
from utils import load_config, get_base_url

# Request timeout in seconds, consistent with list_datasets.py.
REQUEST_TIMEOUT = 10

def update_dataset_description():
    """CLI entry point: update a single RAGFlow dataset's description by name.

    Usage:
        python3 update_description.py --dataset_name NAME --description TEXT

    Resolves the dataset ID by exact-name match over the first 100 datasets,
    then PUTs an update that preserves the dataset's existing permission and
    avatar. Exits with status 1 on configuration, lookup, or update errors.
    """
    config = load_config()

    parser = argparse.ArgumentParser(description="Update RAGFlow dataset description")
    parser.add_argument("--dataset_name", required=True, help="Name of the dataset to update")
    parser.add_argument("--description", required=True, help="New description")

    args = parser.parse_args()
    target_name = args.dataset_name
    new_description = args.description

    base_url = config.get("RAGFLOW_BASE_URL")
    api_key = config.get("RAGFLOW_API_KEY")

    if not base_url or not api_key:
        print("Error: RAGFlow configuration missing. Please run 'python3 scripts/check_config.py' first.")
        sys.exit(1)

    full_url = get_base_url(base_url)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # 1. Resolve the dataset ID by iterating the listing and matching the
        # name exactly, since a server-side name filter may be fuzzy or
        # return multiple hits.
        dataset_id = None
        target_ds_data = None

        response = requests.get(f"{full_url}/datasets?page=1&page_size=100",
                                headers=headers, timeout=REQUEST_TIMEOUT)

        if response.status_code != 200:
            print(f"Error listing datasets: {response.status_code}")
            sys.exit(1)

        data = response.json()
        if data.get("code") != 0:
            print(f"Error listing datasets: {data.get('message')}")
            sys.exit(1)

        for ds in data.get("data", []):
            if ds.get("name") == target_name:
                dataset_id = ds.get("id")
                target_ds_data = ds
                break

        if not dataset_id:
            print(f"Dataset '{target_name}' not found.")
            sys.exit(1)

        # 2. Update the description. Known fields are re-sent in case the
        # endpoint treats PUT as a full replacement. tenant_id is NOT
        # included: the RAGFlow update endpoint rejects it with
        # "Extra inputs are not permitted" (see batch_update.py).
        update_payload = {
            "name": target_name,  # keep the existing name
            "description": new_description,
            "permission": target_ds_data.get("permission", "me"),
            "avatar": target_ds_data.get("avatar", ""),
        }

        update_response = requests.put(f"{full_url}/datasets/{dataset_id}",
                                       headers=headers, json=update_payload,
                                       timeout=REQUEST_TIMEOUT)

        if update_response.status_code != 200:
            print(f"Error updating description: {update_response.status_code} {update_response.text}")
            sys.exit(1)  # propagate failure via exit code for scripted callers

        res_data = update_response.json()
        if res_data.get("code") != 0:
            print(f"Error updating description: {res_data.get('message')}")
            sys.exit(1)

        print(f"Successfully updated description for '{target_name}'")

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    update_dataset_description()

0 commit comments

Comments
 (0)