Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 1 addition & 17 deletions crates/ov_cli/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,25 +61,14 @@ impl HttpClient {
FileOptions::default().compression_method(CompressionMethod::Deflated);

let walkdir = walkdir::WalkDir::new(dir_path);
let base_name = dir_path.file_name().and_then(|n| n.to_str()).ok_or_else(|| {
Error::InvalidPath(format!(
"Non-UTF-8 directory name: {}",
dir_path.to_string_lossy()
))
})?;
for entry in walkdir.into_iter().filter_map(|e| e.ok()) {
let path = entry.path();
if path.is_file() {
let name = path.strip_prefix(dir_path).unwrap_or(path);
let name_str = name.to_str().ok_or_else(|| {
Error::InvalidPath(format!("Non-UTF-8 path: {}", name.to_string_lossy()))
})?;
let zip_name = if name_str.is_empty() {
base_name.to_string()
} else {
format!("{}/{}", base_name, name_str)
};
zip.start_file(zip_name, options)?;
zip.start_file(name_str, options)?;
let mut file = File::open(path)?;
std::io::copy(&mut file, &mut zip)?;
}
Expand Down Expand Up @@ -607,15 +596,10 @@ impl HttpClient {

self.post("/api/v1/resources", &body).await
} else if path_obj.is_file() {
let source_name = path_obj
.file_stem()
.and_then(|n| n.to_str())
.map(|s| s.to_string());
let temp_file_id = self.upload_temp_file(path_obj).await?;

let body = serde_json::json!({
"temp_file_id": temp_file_id,
"source_name": source_name,
"to": to,
"parent": parent,
"reason": reason,
Expand Down
5 changes: 1 addition & 4 deletions crates/ov_cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,7 @@ enum Commands {
AddResource {
/// Local path or URL to import
path: String,
/// Target URI (cannot be used with --parent).
/// - If ends with '/': treated as a directory; preserves original file/dir name under it.
/// - Otherwise: treated as exact root URI (file path or directory root).
/// Note: 'viking://resources' (no trailing slash) is not allowed; use 'viking://resources/'.
/// Exact target URI (must not exist yet) (cannot be used with --parent)
#[arg(long)]
to: Option<String>,
/// Target parent URI (must already exist and be a directory) (cannot be used with --to)
Expand Down
21 changes: 5 additions & 16 deletions docs/en/api/02-resources.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,12 @@ Add a resource to the knowledge base.
|-----------|------|----------|---------|-------------|
| path | str | Yes | - | SDK/CLI: local path, directory path, or URL. Raw HTTP: remote URL only |
| temp_file_id | str | No | None | Upload ID returned by `POST /api/v1/resources/temp_upload` for raw HTTP local file ingestion |
| to | str | No | None | Target Viking URI (must be in `resources` scope; cannot be used with `parent`) |
| parent | str | No | None | Target parent URI (must exist and be a directory; cannot be used with `to`) |
| target | str | No | None | Target Viking URI (must be in `resources` scope) |
| reason | str | No | "" | Why this resource is being added (improves search relevance) |
| instruction | str | No | "" | Special processing instructions |
| wait | bool | No | False | Wait for semantic processing to complete |
| timeout | float | No | None | Timeout in seconds (only used when wait=True) |
| watch_interval | float | No | 0 | Watch interval (minutes). >0 enables/updates watch; <=0 disables watch. Only takes effect when `to` is provided |

**Trailing slash semantics of `to` (applies after the resource is fetched; archives are extracted first)**

- If the resource is a **file**:
- `to` ends with `/`: treat `to` as a directory; final location is `to/<source filename>`
- `to` does not end with `/`: treat `to` as a file; final location is `to`
- If the resource is a **directory**:
- `to` ends with `/`: treat `to` as a directory; final location is `to/<source directory name>/`
- `to` does not end with `/`: treat `to` as a directory; map the directory contents into `to` (no extra directory layer)
- If `to == viking://resources` and the request would map a file to a file or map directory contents into `viking://resources` (no trailing slash), the server returns an error and asks you to change `to` (e.g. add `/` or use a more specific path).
| watch_interval | float | No | 0 | Watch interval (minutes). >0 enables/updates watch; <=0 disables watch. Only takes effect when `target` is provided |

**How local files and directories work**

Expand All @@ -73,7 +62,7 @@ Add a resource to the knowledge base.

When you call `add_resource()` repeatedly for the same resource URI, the system performs an incremental update instead of rebuilding everything from scratch:

- **Trigger**: `to` is provided and already exists in the knowledge base.
- **Trigger**: `target` is provided and already exists in the knowledge base.
- **High-level idea**: each ingestion first builds a temporary resource tree from the new input. During asynchronous semantic processing, the temporary tree is compared against the existing tree at `target`, and only the changed parts are re-processed and synchronized.
- **Incremental behavior in the semantic stage**:
- **Unchanged files**: reuse existing L0 summaries and vector index records; skip vectorization.
Expand Down Expand Up @@ -137,7 +126,7 @@ openviking add-resource ./documents/guide.md --reason "User guide documentation"
```python
result = client.add_resource(
"https://example.com/api-docs.md",
to="viking://resources/external/",
target="viking://resources/external/",
reason="External API documentation"
)
client.wait_processed()
Expand All @@ -151,7 +140,7 @@ curl -X POST http://localhost:1933/api/v1/resources \
-H "X-API-Key: your-key" \
-d '{
"path": "https://example.com/api-docs.md",
"to": "viking://resources/external/",
"target": "viking://resources/external/",
"reason": "External API documentation",
"wait": true
}'
Expand Down
21 changes: 5 additions & 16 deletions docs/zh/api/02-resources.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,12 @@ Input -> Parser -> TreeBuilder -> AGFS -> SemanticQueue -> Vector Index
|------|------|------|--------|------|
| path | str | 是 | - | SDK/CLI 可传本地路径、目录路径或 URL;裸 HTTP 仅支持远端 URL |
| temp_file_id | str | 否 | None | `POST /api/v1/resources/temp_upload` 返回的上传 ID,用于裸 HTTP 导入本地文件 |
| to | str | 否 | None | 目标 Viking URI(必须在 `resources` 作用域内;不可与 `parent` 同时使用) |
| parent | str | 否 | None | 目标父目录 URI(必须存在且为目录;不可与 `to` 同时使用) |
| target | str | 否 | None | 目标 Viking URI(必须在 `resources` 作用域内) |
| reason | str | 否 | "" | 添加该资源的原因(可提升搜索相关性) |
| instruction | str | 否 | "" | 特殊处理指令 |
| wait | bool | 否 | False | 等待语义处理完成 |
| timeout | float | 否 | None | 超时时间(秒),仅在 wait=True 时生效 |
| watch_interval | float | 否 | 0 | 定时更新间隔(分钟)。>0 开启/更新定时任务;<=0 关闭(停用)定时任务。仅在指定 `to` 时生效 |

**`to` 的尾斜杠语义(资源获取完成后适用;压缩内容先解压)**

- 资源为**文件**:
- `to` 以 `/` 结尾:`to` 视为目录,最终落点为 `to/<源文件名>`
- `to` 不以 `/` 结尾:`to` 视为文件,最终落点为 `to`
- 资源为**目录**:
- `to` 以 `/` 结尾:`to` 视为目录,最终落点为 `to/<源目录名>/`
- `to` 不以 `/` 结尾:`to` 视为目录,目录内容映射到 `to`(不额外新增一层目录)
- 当 `to == viking://resources` 且命中“文件映射到文件”或“目录内容映射到目录(无尾斜杠)”语义时,服务端会直接报错并提示修改 `to`(例如加 `/` 或指定更具体路径)。
| watch_interval | float | 否 | 0 | 定时更新间隔(分钟)。>0 开启/更新定时任务;<=0 关闭(停用)定时任务。仅在指定 `target` 时生效 |

**本地文件和目录如何处理**

Expand All @@ -73,7 +62,7 @@ Input -> Parser -> TreeBuilder -> AGFS -> SemanticQueue -> Vector Index

当你为同一个资源 URI 反复调用 `add_resource()` 时,系统会走“增量更新”而不是每次全量重建:

- **触发条件**:请求里显式指定 `to`,且该 `to` 在知识库中已存在。
- **触发条件**:请求里显式指定 `target`,且该 `target` 在知识库中已存在。
- **总体思路**:每次导入都会先把新内容解析/构建成一棵“临时资源树”,随后在异步语义处理阶段,将临时树与 `target` 对应的现有资源树进行对比,只对发生变化的部分做重算与同步。
- **语义阶段的增量**:
- 对**未变化的文件**:复用已有 L0(摘要)与向量索引记录,跳过向量化。
Expand Down Expand Up @@ -137,7 +126,7 @@ openviking add-resource ./documents/guide.md --reason "User guide documentation"
```python
result = client.add_resource(
"https://example.com/api-docs.md",
to="viking://resources/external/",
target="viking://resources/external/",
reason="External API documentation"
)
client.wait_processed()
Expand All @@ -151,7 +140,7 @@ curl -X POST http://localhost:1933/api/v1/resources \
-H "X-API-Key: your-key" \
-d '{
"path": "https://example.com/api-docs.md",
"to": "viking://resources/external/",
"target": "viking://resources/external/",
"reason": "External API documentation",
"wait": true
}'
Expand Down
12 changes: 3 additions & 9 deletions openviking/parse/parsers/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,10 +174,9 @@ async def parse_content(
await viking_fs.mkdir(temp_uri)
logger.debug(f"[MarkdownParser] Created temp directory: {temp_uri}")

name_hint = kwargs.get("resource_name") or kwargs.get("source_name")
# Get document title
doc_title = meta.get("frontmatter", {}).get(
"title", name_hint or (Path(source_path).stem if source_path else "Document")
"title", Path(source_path).stem if source_path else "Document"
)

# Create root directory
Expand All @@ -188,9 +187,7 @@ async def parse_content(
logger.info(f"[MarkdownParser] Found {len(headings)} headings")

# Parse and create directory structure
await self._parse_and_create_structure(
content, headings, root_dir, source_path, doc_name_override=name_hint
)
await self._parse_and_create_structure(content, headings, root_dir, source_path)

parse_time = time.time() - start_time
logger.info(f"[MarkdownParser] Parse completed in {parse_time:.2f}s")
Expand Down Expand Up @@ -368,7 +365,6 @@ async def _parse_and_create_structure(
headings: List[Tuple[int, int, str, int]],
root_dir: str,
source_path: Optional[str] = None,
doc_name_override: Optional[str] = None,
) -> None:
"""
Parse markdown and create directory structure directly in VikingFS.
Expand Down Expand Up @@ -399,9 +395,7 @@ async def _parse_and_create_structure(
await viking_fs.mkdir(root_dir)

# Get document name
doc_name = self._sanitize_for_path(
doc_name_override or (Path(source_path).stem if source_path else "content")
)
doc_name = self._sanitize_for_path(Path(source_path).stem if source_path else "content")

# Small document: save as single file (check both token and char limits)
if estimated_tokens <= max_size and len(content) <= max_chars:
Expand Down
65 changes: 7 additions & 58 deletions openviking/parse/tree_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
from openviking.server.identity import RequestContext
from openviking.storage.viking_fs import get_viking_fs
from openviking.utils import parse_code_hosting_url
from openviking_cli.exceptions import InvalidArgumentError
from openviking_cli.utils.uri import VikingURI

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -154,59 +153,9 @@ async def finalize_from_temp(
# 2. Determine base_uri and final document name with org/repo for GitHub/GitLab
auto_base_uri = self._get_base_uri(scope, source_path, source_format)
base_uri = parent_uri or auto_base_uri
temp_artifact_uri = temp_doc_uri
artifact_kind = "dir"

# 3. Determine candidate_uri / final_uri
# 3. Determine candidate_uri
if to_uri:
candidate_uri = VikingURI.normalize(to_uri)
to_is_dir_semantics = candidate_uri.endswith("/")
to_is_protected_resources_root = candidate_uri == "viking://resources"

entries = await viking_fs.ls(temp_doc_uri, ctx=ctx)
visible_entries = [
e
for e in entries
if e.get("name") not in (".", "..") and not str(e.get("name", "")).startswith(".")
]
force_directory_resource = source_format in ("directory", "repository")
is_single_file_resource = (
not force_directory_resource
and len(visible_entries) == 1
and not visible_entries[0].get("isDir", False)
)

if is_single_file_resource:
artifact_kind = "file"
source_filename = str(visible_entries[0].get("name", "")).strip()
if not source_filename:
raise ValueError(f"[TreeBuilder] Empty filename in {temp_doc_uri}")
temp_artifact_uri = f"{temp_doc_uri.rstrip('/')}/{source_filename}"

if to_is_dir_semantics:
final_uri = VikingURI(candidate_uri).join(source_filename).uri
else:
if to_is_protected_resources_root:
raise InvalidArgumentError(
"`to` 不允许为 viking://resources(无尾斜杠)用于文件资源。"
"请修改 to:例如使用 viking://resources/ 以保留原文件名,"
"或使用 viking://resources/<目录>/ 或 viking://resources/<目录>/<文件名>。"
)
final_uri = candidate_uri
else:
artifact_kind = "dir"
temp_artifact_uri = temp_doc_uri

if to_is_dir_semantics:
final_uri = VikingURI(candidate_uri).join(final_doc_name).uri.rstrip("/") + "/"
else:
if to_is_protected_resources_root:
raise InvalidArgumentError(
"`to` 不允许为 viking://resources(无尾斜杠)用于目录资源。"
"请修改 to:例如使用 viking://resources/ 以保留目录名,"
"或使用 viking://resources/<目录名>/ 或 viking://resources/<目标目录>。"
)
final_uri = candidate_uri
candidate_uri = to_uri
else:
if parent_uri:
# Parent URI must exist and be a directory
Expand All @@ -217,6 +166,10 @@ async def finalize_from_temp(
if not stat_result.get("isDir"):
raise ValueError(f"Parent URI is not a directory: {parent_uri}")
candidate_uri = VikingURI(base_uri).join(final_doc_name).uri

if to_uri:
final_uri = candidate_uri
else:
final_uri = await self._resolve_unique_uri(candidate_uri)

tree = BuildingTree(
Expand All @@ -228,11 +181,7 @@ async def finalize_from_temp(
tree._candidate_uri = candidate_uri

# Create a minimal Context object for the root so that tree.root is not None
root_context = Context(
uri=final_uri,
temp_uri=temp_artifact_uri,
meta={"artifact_kind": artifact_kind},
)
root_context = Context(uri=final_uri, temp_uri=temp_doc_uri)
tree.add_context(root_context)

return tree
22 changes: 2 additions & 20 deletions openviking/utils/media_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,31 +167,13 @@ async def _process_file(
try:
with zipfile.ZipFile(file_path, "r") as zipf:
safe_extract_zip(zipf, temp_dir)
visible = [
p
for p in temp_dir.iterdir()
if p.name not in {".", ".."} and not p.name.startswith(".")
]
dirs = [p for p in visible if p.is_dir()]
files = [p for p in visible if p.is_file()]

if len(dirs) == 1 and not files:
inferred_root = dirs[0]
kwargs_for_dir = dict(kwargs)
kwargs_for_dir.pop("source_name", None)
return await self._process_directory(inferred_root, instruction, **kwargs_for_dir)
if len(files) == 1 and not dirs:
return await self._process_file(files[0], instruction, **kwargs)

kwargs_for_dir = dict(kwargs)
kwargs_for_dir["source_name"] = kwargs_for_dir.get("source_name") or file_path.stem
return await self._process_directory(temp_dir, instruction, **kwargs_for_dir)
return await self._process_directory(temp_dir, instruction, **kwargs)
finally:
pass # Don't delete temp_dir yet, it will be used by TreeBuilder
return await parse(
str(file_path),
instruction=instruction,
vlm_processor=self._get_vlm_processor(),
storage=self.storage,
resource_name=kwargs.get("source_name") or kwargs.get("resource_name") or file_path.stem,
resource_name=file_path.stem,
)
Loading
Loading