Skip to content

Commit c4cb46e

Browse files
committed
feat(ragflow-knowledge): 新增知识库管理脚本并更新文档
新增 `list_datasets.py` 脚本,用于列出所有可用知识库以支持智能路由。 新增 `batch_update.py` 脚本,用于批量更新预定义知识库的描述。 新增 `update_description.py` 脚本,用于更新单个知识库的描述。 更新 SKILL.md 文档,新增“列出可用知识库”章节并优化存储流程说明。
1 parent 0f04ea4 commit c4cb46e

File tree

4 files changed

+239
-3
lines changed

4 files changed

+239
-3
lines changed

skills/ragflow-knowledge/SKILL.md

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,23 +41,41 @@ python3 skills/ragflow-knowledge/scripts/retrieve_knowledge.py --query "您的
4141
- **Query**: 要搜索的问题或主题。
4242
- **Chat ID**: (可选)使用的特定聊天 ID。
4343

44-
### 3. 存储知识(保存)
44+
### 3. 列出可用知识库
45+
46+
在决定将知识存入何处之前,先列出当前可用的知识库,以便根据内容主题选择最合适的目标。
47+
48+
```bash
49+
python3 skills/ragflow-knowledge/scripts/list_datasets.py
50+
```
51+
52+
### 4. 存储知识(保存)
4553

4654
当用户明确要求将内容(文本、代码或文件)保存到知识库时使用。
4755

56+
**智能路由策略**
57+
1. 首先调用 `list_datasets.py` 获取所有知识库列表。
58+
2. 分析用户提供的知识内容(主题、类型、敏感度)。
59+
3. 将内容匹配到最合适的知识库(例如:技术文档 -> "Technical Docs", HR政策 -> "HR Policies")。
60+
4. 调用 `save_knowledge.py` 并指定 `dataset_name` 或 `dataset_id`。
61+
5. 如果无法确定,可以询问用户或存入默认知识库(如 "General Knowledge")。
62+
4863
```bash
49-
python3 skills/ragflow-knowledge/scripts/save_knowledge.py --content "要保存的内容" [--file_path "/文件/路径"] [--dataset_name "我的知识库"]
64+
python3 skills/ragflow-knowledge/scripts/save_knowledge.py --content "要保存的内容" [--file_path "/文件/路径"] [--dataset_name "目标知识库名称"]
5065
```
5166

5267
- **Content**: 直接保存的文本内容。
5368
- **File Path**: 要上传的本地文件路径。
54-
- **Dataset Name**: (可选)保存到的数据集名称。默认为 "TraeKnowledge"
69+
- **Dataset Name**: (可选)保存到的数据集名称。如果不指定,脚本将尝试使用默认值或创建新库
5570

5671
## 脚本详情
5772

5873
### `check_config.py`
5974
验证与 RAGFlow 的连接并列出可用的数据集/聊天。这有助于确保环境准备就绪。
6075

76+
### `list_datasets.py`
77+
返回所有可用知识库的详细列表(JSON格式),包含ID、名称、描述和文档数量。Agent 应使用此输出来决定存储目标。
78+
6179
### `retrieve_knowledge.py`
6280
向 RAGFlow Chat API 发送查询并返回答案/上下文。
6381

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import os
import requests
import sys
import json
from utils import load_config, get_base_url

# Request timeout in seconds, consistent with list_datasets.py so no
# script in this skill can hang indefinitely on an unreachable server.
REQUEST_TIMEOUT = 10

# Predefined dataset descriptions, keyed by exact dataset name.
# Names not found on the server are skipped with a warning.
UPDATES = {
    "TML-技术文档": "存放团队核心技术沉淀,包括领域驱动设计(DDD)、架构设计模式、技术规范、技术方案及论文。",
    "TML-团队规范": "团队协作标准与管理流程,包括代码编写规范、Git 提交规范、研发流程管理制度、考勤与行政规则。",
    "TML-公共资源": "团队公共资产与基础设施信息,包括服务器资产列表、脱敏后的账号管理、常用软件工具包下载地址、常见问题解答(FAQ)。",
    "TML-项目文档": "具体项目的全生命周期文档,包括需求规格说明书(PRD)、项目排期表、会议纪要、测试报告与验收文档。",
    "TML-知识库": "非结构化或未分类的通用知识,包括行业动态、竞品情报、外部采集的参考资料及碎片化信息。"
}

def batch_update():
    """Batch-update the descriptions of the datasets listed in UPDATES.

    Flow:
      1. Load RAGFlow connection settings (base URL + API key).
      2. List the existing datasets (first 100).
      3. For each name in UPDATES that exists on the server, PUT the new
         description while preserving the dataset's current permission
         and avatar.

    Exits with status 1 on missing configuration or a failed listing;
    per-dataset update failures are reported but do not abort the run.
    """
    config = load_config()
    base_url = config.get("RAGFLOW_BASE_URL")
    api_key = config.get("RAGFLOW_API_KEY")

    if not base_url or not api_key:
        print("Error: RAGFlow configuration missing.")
        sys.exit(1)

    full_url = get_base_url(base_url)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    print("Fetching existing datasets...")
    try:
        response = requests.get(f"{full_url}/datasets?page=1&page_size=100",
                                headers=headers, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            print(f"Error listing datasets: {response.status_code}")
            sys.exit(1)

        data = response.json()
        if data.get("code") != 0:
            print(f"Error listing datasets: {data.get('message')}")
            sys.exit(1)

        existing_datasets = {ds['name']: ds for ds in data.get("data", [])}

        for name, description in UPDATES.items():
            if name not in existing_datasets:
                print(f"⚠️ Dataset '{name}' not found. Skipping.")
                continue

            ds = existing_datasets[name]
            print(f"Updating '{name}'...")

            update_payload = {
                "name": name,
                "description": description,
                "permission": ds.get("permission", "me"),
                "avatar": ds.get("avatar", ""),
                # tenant_id deliberately omitted: the RAGFlow update
                # endpoint rejects it with "Extra inputs are not permitted".
            }

            update_res = requests.put(f"{full_url}/datasets/{ds['id']}",
                                      headers=headers, json=update_payload,
                                      timeout=REQUEST_TIMEOUT)

            if update_res.status_code != 200:
                print(f"  ❌ Failed: {update_res.status_code}")
                continue

            res_data = update_res.json()
            if res_data.get("code") == 0:
                print("  ✅ Success")
            else:
                print(f"  ❌ Failed: {res_data.get('message')}")

    except Exception as e:
        # Broad catch is acceptable at this top-level CLI boundary:
        # report the error and exit non-zero.
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    batch_update()
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import os
import requests
import json
import sys
from utils import load_config, get_base_url

def list_datasets():
    """Print every available RAGFlow dataset as a JSON array on stdout.

    Each entry carries id, name, description, permission and doc_count so
    that an agent can pick the most suitable storage target. Exits with
    status 1 on missing configuration, a non-200 response, a non-zero API
    code, or any request error.
    """
    config = load_config()
    base_url = config.get("RAGFLOW_BASE_URL")
    api_key = config.get("RAGFLOW_API_KEY")

    if not base_url or not api_key:
        print("Error: RAGFlow configuration missing. Please run 'python3 scripts/check_config.py' first.")
        sys.exit(1)

    full_url = get_base_url(base_url)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        resp = requests.get(f"{full_url}/datasets?page=1&page_size=100",
                            headers=headers, timeout=10)

        if resp.status_code != 200:
            print(f"Error: Failed to connect (Status Code: {resp.status_code})")
            sys.exit(1)

        payload = resp.json()
        if payload.get("code") != 0:
            print(f"Error: API returned code {payload.get('code')}: {payload.get('message')}")
            sys.exit(1)

        summary = [
            {
                "id": ds.get("id"),
                "name": ds.get("name"),
                "description": ds.get("description", ""),
                "permission": ds.get("permission", "me"),
                "doc_count": ds.get("doc_count", 0),
            }
            for ds in payload.get("data", [])
        ]

        # Machine-readable output so the calling agent can parse it directly.
        print(json.dumps(summary, ensure_ascii=False, indent=2))

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    list_datasets()
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import os
import requests
import argparse
import sys
import json
from utils import load_config, get_base_url

# Request timeout in seconds, consistent with list_datasets.py.
REQUEST_TIMEOUT = 10

def update_dataset_description():
    """CLI entry point: update a single RAGFlow dataset's description by name.

    Usage:
        python3 update_description.py --dataset_name NAME --description TEXT

    Resolves the dataset ID by exact-name match over the first 100 datasets,
    then PUTs an update that preserves the dataset's existing permission and
    avatar. Exits with status 1 on configuration, lookup, or update errors.
    """
    config = load_config()

    parser = argparse.ArgumentParser(description="Update RAGFlow dataset description")
    parser.add_argument("--dataset_name", required=True, help="Name of the dataset to update")
    parser.add_argument("--description", required=True, help="New description")

    args = parser.parse_args()
    target_name = args.dataset_name
    new_description = args.description

    base_url = config.get("RAGFLOW_BASE_URL")
    api_key = config.get("RAGFLOW_API_KEY")

    if not base_url or not api_key:
        print("Error: RAGFlow configuration missing. Please run 'python3 scripts/check_config.py' first.")
        sys.exit(1)

    full_url = get_base_url(base_url)
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    try:
        # 1. Resolve the dataset ID by iterating the listing and matching the
        # name exactly, since a server-side name filter may be fuzzy or
        # return multiple hits.
        dataset_id = None
        target_ds_data = None

        response = requests.get(f"{full_url}/datasets?page=1&page_size=100",
                                headers=headers, timeout=REQUEST_TIMEOUT)

        if response.status_code != 200:
            print(f"Error listing datasets: {response.status_code}")
            sys.exit(1)

        data = response.json()
        if data.get("code") != 0:
            print(f"Error listing datasets: {data.get('message')}")
            sys.exit(1)

        for ds in data.get("data", []):
            if ds.get("name") == target_name:
                dataset_id = ds.get("id")
                target_ds_data = ds
                break

        if not dataset_id:
            print(f"Dataset '{target_name}' not found.")
            sys.exit(1)

        # 2. Update the description. Known fields are re-sent in case the
        # endpoint treats PUT as a full replacement. tenant_id is NOT
        # included: the RAGFlow update endpoint rejects it with
        # "Extra inputs are not permitted" (see batch_update.py).
        update_payload = {
            "name": target_name,  # keep the existing name
            "description": new_description,
            "permission": target_ds_data.get("permission", "me"),
            "avatar": target_ds_data.get("avatar", ""),
        }

        update_response = requests.put(f"{full_url}/datasets/{dataset_id}",
                                       headers=headers, json=update_payload,
                                       timeout=REQUEST_TIMEOUT)

        if update_response.status_code != 200:
            print(f"Error updating description: {update_response.status_code} {update_response.text}")
            sys.exit(1)  # propagate failure via exit code for scripted callers

        res_data = update_response.json()
        if res_data.get("code") != 0:
            print(f"Error updating description: {res_data.get('message')}")
            sys.exit(1)

        print(f"Successfully updated description for '{target_name}'")

    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)

if __name__ == "__main__":
    update_dataset_description()

0 commit comments

Comments
 (0)