diff --git a/examples/openclaw-migration/README.md b/examples/openclaw-migration/README.md new file mode 100644 index 000000000..4bd40b3e5 --- /dev/null +++ b/examples/openclaw-migration/README.md @@ -0,0 +1,117 @@ +# OpenClaw → OpenViking Memory Migration + +Imports your existing [OpenClaw](https://github.com/openclaw-ai/openclaw) memory files directly into OpenViking's memory system. + +**Zero LLM calls.** Content is preserved verbatim; only embeddings are generated. + +--- + +## What gets migrated + +| OpenClaw file | OpenViking category | Why | +|---|---|---| +| `MEMORY.md`, `memory.md` | `entities` | Curated durable knowledge — projects, people, concepts | +| `YYYY-MM-DD.md` (daily logs) | `events` | Time-stamped records, decisions, milestones | +| `YYYY-MM-DD-slug.md` (session summaries) | `cases` | Problem + solution context from specific sessions | +| Everything else | `entities` | Safe fallback for arbitrary markdown files | + +--- + +## Requirements + +```bash +pip install openviking +``` + +A valid `~/.openviking/ov.conf` (or equivalent) with an embedding model configured is required for the real run. 
+ +--- + +## Usage + +```bash +# Preview — no data written +python migrate.py --dry-run + +# Migrate with defaults +# OpenClaw dir : ~/.openclaw/workspace +# OV data dir : ./data +# identity : account=default, user=default, agent=default +python migrate.py + +# Custom paths and identity +python migrate.py \ + --openclaw-dir ~/myworkspace \ + --ov-data-dir ./ov-data \ + --account-id myaccount \ + --user-id myuser \ + --agent-id myagent + +# Force all files into a single category +python migrate.py --category events +``` + +### Options + +| Flag | Default | Description | +|---|---|---| +| `--openclaw-dir PATH` | `~/.openclaw/workspace` | Path to OpenClaw workspace | +| `--ov-data-dir PATH` | `./data` | OpenViking data directory | +| `--account-id TEXT` | `default` | Account ID (alphanumeric/underscore/hyphen) | +| `--user-id TEXT` | `default` | User ID | +| `--agent-id TEXT` | `default` | Agent ID | +| `--category TEXT` | *(auto)* | Override category for all files (`entities`, `events`, `cases`, or `preferences`) | +| `--dry-run` | off | Preview without writing | + +--- + +## Dry-run example + +``` +OpenClaw → OpenViking Migration (DRY RUN) +Found 3 file(s) in /Users/alice/.openclaw/workspace + + MEMORY.md → entities (3,421 chars) + memory/2026-03-15.md → events (1,832 chars) + memory/2026-03-15-bug-fix.md → cases (942 chars) + +Would import: 3 file(s) | 0 LLM calls | ~3 embedding job(s) queued | 6,195 chars total +Run without --dry-run to proceed. +``` + +--- + +## How it works + +For each file the script: + +1. Reads the Markdown content. +2. Classifies it into an OV memory category (or uses `--category`). +3. Builds a one-line `abstract` (first non-empty line, ≤ 100 chars). +4. Calls `MemoryExtractor.create_memory()` — writes the file to VikingFS. +5. Calls `SessionCompressor._index_memory()` — enqueues an embedding job. + +Memories appear in OV's memory-specific retrieval (`viking://user/{user_id}/memories/`, where `{user_id}` is the value passed via `--user-id`) as soon as the embedding worker processes the queue. + +--- + +## Running tests + +```bash +PYTHONPATH=. 
.venv/bin/python -m pytest tests/unit/test_openclaw_migration.py -v --no-cov +``` + +--- + +## Verifying the import + +After a real run, confirm memories were written: + +```python +import openviking as ov + +client = ov.OpenViking(path="./data") +client.initialize() +results = client.find("viking://user/default/memories/entities/") +print(results) +``` diff --git a/examples/openclaw-migration/migrate.py b/examples/openclaw-migration/migrate.py new file mode 100644 index 000000000..6d7e8a8ac --- /dev/null +++ b/examples/openclaw-migration/migrate.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: Apache-2.0 +""" +OpenClaw → OpenViking Memory Migration Tool + +Imports plain-Markdown memory files from an OpenClaw workspace directly into +OpenViking's memory system using MemoryExtractor.create_memory() + +SessionCompressor._index_memory(). Zero LLM calls — only file reads, +memory writes, and embedding jobs enqueued. 
def _parse_args() -> argparse.Namespace:
    """Parse the migration script's command-line flags from ``sys.argv``.

    Returns:
        The populated namespace with attributes ``openclaw_dir``,
        ``ov_data_dir``, ``account_id``, ``user_id``, ``agent_id``,
        ``category`` (``None`` unless overridden) and ``dry_run``.
    """
    cli = argparse.ArgumentParser(
        epilog=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Migrate OpenClaw memory files into OpenViking.",
    )

    # Plain string options share the same shape: (flag, default, help text).
    # They are registered in this order so --help lists them as before.
    default_workspace = Path.home() / ".openclaw" / "workspace"
    string_options = (
        (
            "--openclaw-dir",
            str(default_workspace),
            "Path to OpenClaw workspace (default: ~/.openclaw/workspace)",
        ),
        (
            "--ov-data-dir",
            "./data",
            "OpenViking data directory (default: ./data)",
        ),
        ("--account-id", "default", "Account ID (default: default)"),
        ("--user-id", "default", "User ID (default: default)"),
        ("--agent-id", "default", "Agent ID (default: default)"),
    )
    for flag, fallback, description in string_options:
        cli.add_argument(flag, default=fallback, help=description)

    cli.add_argument(
        "--category",
        choices=["entities", "events", "cases", "preferences"],
        default=None,
        help="Override category for ALL files (skips auto-classification)",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what would be imported without writing anything",
    )
    return cli.parse_args()
classification +# --------------------------------------------------------------------------- + +_DAILY_LOG_RE = re.compile(r"^\d{4}-\d{2}-\d{2}\.md$") +_SESSION_SUMMARY_RE = re.compile(r"^\d{4}-\d{2}-\d{2}-.+\.md$") + + +def classify_file(path: Path, category_override: Optional[str] = None) -> str: + """Return the OpenViking MemoryCategory value for an OpenClaw file. + + Mapping: + MEMORY.md / memory.md → entities + YYYY-MM-DD.md → events + YYYY-MM-DD-slug.md → cases + anything else → entities (safe fallback) + + A non-None *category_override* takes precedence over all rules. + """ + if category_override is not None: + return category_override + + name = path.name + if name.lower() in ("memory.md",): + return "entities" + if _DAILY_LOG_RE.match(name): + return "events" + if _SESSION_SUMMARY_RE.match(name): + return "cases" + return "entities" + + +# --------------------------------------------------------------------------- +# Content helpers +# --------------------------------------------------------------------------- + +def build_abstract(content: str) -> str: + """Return a one-line abstract for a memory file. + + Strategy: + 1. First non-empty line, truncated to 100 chars. + 2. If every leading line is blank/whitespace, fall back to the first + 100 chars of the raw content (stripped). 
+ """ + for line in content.splitlines(): + stripped = line.strip() + if stripped: + return stripped[:100] + # All lines blank — use raw content prefix + return content.strip()[:100] + + +def build_overview(content: str) -> str: + """Return a medium-detail overview (first 5 non-empty lines, max 500 chars).""" + lines: List[str] = [] + for line in content.splitlines(): + if line.strip(): + lines.append(line) + if len(lines) >= 5: + break + overview = "\n".join(lines) + return overview[:500] + + +# --------------------------------------------------------------------------- +# File discovery +# --------------------------------------------------------------------------- + +class MemFile(NamedTuple): + path: Path + category: str + content: str + + +def discover_files(openclaw_dir: Path, category_override: Optional[str]) -> List[MemFile]: + """Walk the OpenClaw workspace and return all importable .md files.""" + if not openclaw_dir.exists(): + return [] + + candidates: List[MemFile] = [] + + # Root-level MEMORY.md / memory.md + for name in ("MEMORY.md", "memory.md"): + p = openclaw_dir / name + if p.is_file(): + content = p.read_text(encoding="utf-8", errors="replace") + candidates.append( + MemFile( + path=p, + category=classify_file(p, category_override), + content=content, + ) + ) + + # memory/ sub-directory + mem_dir = openclaw_dir / "memory" + if mem_dir.is_dir(): + for p in sorted(mem_dir.rglob("*.md")): + if p.is_file(): + content = p.read_text(encoding="utf-8", errors="replace") + candidates.append( + MemFile( + path=p, + category=classify_file(p, category_override), + content=content, + ) + ) + + return candidates + + +# --------------------------------------------------------------------------- +# Dry-run display +# --------------------------------------------------------------------------- + +def _display_name(file: MemFile, openclaw_dir: Path) -> str: + try: + return str(file.path.relative_to(openclaw_dir)) + except ValueError: + return file.path.name + + 
def print_dry_run(files: List[MemFile], openclaw_dir: Path) -> None:
    """Print a preview of what a real run would import; writes nothing.

    Args:
        files: Files returned by ``discover_files``.
        openclaw_dir: Workspace root, used to shorten the displayed paths.
    """
    print()
    print("OpenClaw → OpenViking Migration (DRY RUN)")
    print(f"Found {len(files)} file(s) in {openclaw_dir}")
    print()

    if not files:
        print(" (no files found — nothing to import)")
        return

    # Resolve display names once; they drive both column width and rows.
    names = [_display_name(f, openclaw_dir) for f in files]
    col_w = max(len(name) for name in names)
    for name, f in zip(names, files):
        chars = len(f.content)
        print(f" {name:<{col_w}} → {f.category:<12} ({chars:,} chars)")

    total_chars = sum(len(f.content) for f in files)
    print()
    print(
        f"Would import: {len(files)} file(s) | 0 LLM calls | "
        f"~{len(files)} embedding job(s) queued | {total_chars:,} chars total"
    )
    print("Run without --dry-run to proceed.")
    print()


# ---------------------------------------------------------------------------
# Async migration core
# ---------------------------------------------------------------------------

async def _migrate_async(
    files: List[MemFile],
    openclaw_dir: Path,
    ov_data_dir: str,
    account_id: str,
    user_id: str,
    agent_id: str,
) -> None:
    """Initialize OpenViking and write each file as a memory.

    For every file a ``CandidateMemory`` is built, written with
    ``MemoryExtractor.create_memory()`` and handed to
    ``SessionCompressor._index_memory()``. A failure on one file is reported
    and counted; the remaining files are still processed. The client is
    closed on every exit path.

    Args:
        files: Files to import.
        openclaw_dir: Workspace root, used for display names only.
        ov_data_dir: Path handed to ``ov.OpenViking(path=...)``.
        account_id: Account part of the identity.
        user_id: User part of the identity.
        agent_id: Agent part of the identity.

    Raises:
        SystemExit: When OpenViking cannot be imported, or when no VikingDB
            manager is available after initialization.
    """

    # Late imports so the script can be imported without OV installed
    # (e.g. unit tests for classify_file / build_abstract).
    try:
        import openviking as ov
        from openviking.server.identity import RequestContext, Role
        from openviking.session.compressor import SessionCompressor
        from openviking.session.memory_extractor import (
            CandidateMemory,
            MemoryCategory,
            MemoryExtractor,
        )
        from openviking_cli.session.user_id import UserIdentifier
    except ImportError as exc:
        print(f"ERROR: Could not import OpenViking — is it installed? ({exc})")
        sys.exit(1)

    # -- Boot embedded OV ---------------------------------------------------
    print(f"Initializing OpenViking at {ov_data_dir!r} …")
    client = ov.OpenViking(path=ov_data_dir)
    try:
        client.initialize()
        print(" OpenViking initialized.")

        # -- Build identity & context ---------------------------------------
        user = UserIdentifier(account_id, user_id, agent_id)
        ctx = RequestContext(user=user, role=Role.ROOT)

        session_id = f"openclaw-migration-{int(datetime.now(timezone.utc).timestamp())}"

        # -- Wire compressor with the VikingDB already initialised by OV ----
        # Access path: SyncOpenViking → AsyncOpenViking._service (property) → OpenVikingService
        # NOTE(review): this reaches through private attributes; confirm
        # whether OpenViking exposes a public accessor for the manager.
        vikingdb = client._async_client._service.vikingdb_manager
        if vikingdb is None:
            print("ERROR: VikingDBManager not available after initialize(). Aborting.")
            sys.exit(1)

        extractor = MemoryExtractor()
        compressor = SessionCompressor(vikingdb=vikingdb)

        # -- Migrate each file ----------------------------------------------
        ok = 0
        failed = 0

        for f in files:
            display = _display_name(f, openclaw_dir)
            try:
                category = MemoryCategory(f.category)
            except ValueError:
                print(f" SKIP {display} (unknown category {f.category!r})")
                failed += 1
                continue

            candidate = CandidateMemory(
                category=category,
                abstract=build_abstract(f.content),
                overview=build_overview(f.content),
                content=f.content,
                source_session=session_id,
                user=str(user),
                language="auto",
            )

            # Per-file boundary: one bad file must not abort the whole batch.
            try:
                # NOTE(review): _user_id is a private attribute of
                # UserIdentifier — confirm there is no public equivalent.
                memory = await extractor.create_memory(
                    candidate, str(user._user_id), session_id, ctx
                )
            except Exception as exc:
                print(f" FAIL {display} (create_memory raised {type(exc).__name__}: {exc})")
                failed += 1
                continue
            if memory is None:
                print(f" FAIL {display} (create_memory returned None)")
                failed += 1
                continue

            # The memory is already written at this point, so an indexing
            # problem is a warning and the file still counts as imported.
            try:
                indexed = await compressor._index_memory(memory, ctx, change_type="added")
            except Exception as exc:
                print(f" WARN {display} (indexing raised {type(exc).__name__}: {exc})")
                ok += 1
                continue
            if indexed:
                print(f" OK {display} → {f.category}")
            else:
                print(f" WARN {display} (indexed={indexed})")
            ok += 1

        print()
        print(f"Done: {ok} imported, {failed} failed.")
    finally:
        # Close on every exit path (success, sys.exit, unexpected error).
        # Closing stays best-effort, but a failure is reported, not hidden.
        try:
            client.close()
        except Exception as exc:
            print(
                f"WARNING: OpenViking client did not close cleanly ({exc})",
                file=sys.stderr,
            )


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

def main() -> None:
    """Run the CLI: discover files, then preview (--dry-run) or migrate."""
    args = _parse_args()

    openclaw_dir = Path(args.openclaw_dir).expanduser().resolve()
    files = discover_files(openclaw_dir, args.category)

    if args.dry_run:
        print_dry_run(files, openclaw_dir)
        return

    if not files:
        print(f"No .md files found in {openclaw_dir}. Nothing to import.")
        return

    asyncio.run(
        _migrate_async(
            files=files,
            openclaw_dir=openclaw_dir,
            ov_data_dir=args.ov_data_dir,
            account_id=args.account_id,
            user_id=args.user_id,
            agent_id=args.agent_id,
        )
    )


if __name__ == "__main__":
    main()
# ===========================================================================
# File classification
# ===========================================================================


class TestClassifyFile:
    """classify_file maps OpenClaw file names to category names."""

    def test_classify_memory_md_upper(self, tmp_path):
        """MEMORY.md maps to entities."""
        p = tmp_path / "MEMORY.md"
        p.touch()
        assert classify_file(p) == "entities"

    def test_classify_memory_md_lower(self, tmp_path):
        """memory.md (lowercase) maps to entities."""
        p = tmp_path / "memory.md"
        p.touch()
        assert classify_file(p) == "entities"

    def test_classify_daily_log(self, tmp_path):
        """YYYY-MM-DD.md maps to events."""
        p = tmp_path / "2026-03-15.md"
        p.touch()
        assert classify_file(p) == "events"

    def test_classify_daily_log_various_dates(self, tmp_path):
        """Multiple valid date filenames all map to events."""
        dates = ["2024-01-01.md", "2025-12-31.md", "2026-04-07.md"]
        for name in dates:
            p = tmp_path / name
            p.touch()
            assert classify_file(p) == "events", f"Expected events for {name}"

    def test_classify_session_summary(self, tmp_path):
        """YYYY-MM-DD-slug.md maps to cases."""
        p = tmp_path / "2026-03-15-bug-fix.md"
        p.touch()
        assert classify_file(p) == "cases"

    def test_classify_session_summary_multi_word_slug(self, tmp_path):
        """YYYY-MM-DD-multi-word-slug.md still maps to cases."""
        p = tmp_path / "2026-03-15-api-design-review.md"
        p.touch()
        assert classify_file(p) == "cases"

    def test_classify_unknown_defaults_to_entities(self, tmp_path):
        """Any other .md filename falls back to entities."""
        p = tmp_path / "random-notes.md"
        p.touch()
        assert classify_file(p) == "entities"

    def test_classify_category_override(self, tmp_path):
        """--category flag overrides all classification rules."""
        for name in ("MEMORY.md", "2026-04-07.md", "2026-04-07-notes.md", "other.md"):
            p = tmp_path / name
            p.touch()
            assert classify_file(p, category_override="preferences") == "preferences"

    def test_classify_category_override_events(self, tmp_path):
        """Override to events works for a root memory file."""
        p = tmp_path / "MEMORY.md"
        p.touch()
        assert classify_file(p, category_override="events") == "events"


# ===========================================================================
# Abstract builder
# ===========================================================================


class TestBuildAbstract:
    """build_abstract returns the first non-blank line, capped at 100 chars."""

    def test_build_abstract_first_line(self):
        """abstract = first non-empty line."""
        content = "# My Project Notes\n\nSome details here."
        assert build_abstract(content) == "# My Project Notes"

    def test_build_abstract_skips_leading_blank_lines(self):
        """Leading blank lines are skipped; first non-empty line is used."""
        content = "\n\n\nActual first line\nSecond line"
        assert build_abstract(content) == "Actual first line"

    def test_build_abstract_truncated_to_100(self):
        """abstract is capped at 100 characters."""
        long_line = "A" * 150
        assert build_abstract(long_line) == "A" * 100

    def test_build_abstract_fallback_all_blank(self):
        """Whitespace-only leading lines are skipped like empty ones.

        Despite the historical name, this does not reach the all-blank case:
        the content ends with a real line, which becomes the abstract.
        """
        content = " \n\n \n\nSome content here"
        result = build_abstract(content)
        assert result == "Some content here"

    def test_build_abstract_whitespace_only(self):
        """Content made only of blank/whitespace lines yields an empty abstract."""
        assert build_abstract(" \n\t\n \n") == ""

    def test_build_abstract_empty_string(self):
        """Empty content returns empty string (no crash)."""
        assert build_abstract("") == ""

    def test_build_abstract_single_line_no_newline(self):
        """Single line without trailing newline works correctly."""
        assert build_abstract("Hello world") == "Hello world"


# ===========================================================================
# Overview builder
# ===========================================================================


class TestBuildOverview:
    """build_overview keeps the first 5 non-blank lines, capped at 500 chars."""

    def test_overview_first_five_lines(self):
        """overview = first 5 non-empty lines joined."""
        lines = [f"Line {i}" for i in range(1, 9)]
        content = "\n".join(lines)
        overview = build_overview(content)
        assert overview == "\n".join(lines[:5])

    def test_overview_truncated_to_500(self):
        """overview is capped at 500 characters."""
        long_line = "A" * 600
        assert len(build_overview(long_line)) <= 500

    def test_overview_skips_blank_lines(self):
        """Blank lines are ignored when collecting up to 5 lines."""
        content = "First\n\nSecond\n\nThird\n\nFourth\n\nFifth\n\nSixth"
        overview = build_overview(content)
        assert "Sixth" not in overview
        assert "First" in overview and "Fifth" in overview


# ===========================================================================
# File discovery
# ===========================================================================


class TestDiscoverFiles:
    """discover_files finds the root memory file and memory/**/*.md."""

    def test_discovers_memory_md(self, tmp_path):
        """MEMORY.md at workspace root is discovered."""
        (tmp_path / "MEMORY.md").write_text("# Root memory", encoding="utf-8")
        files = discover_files(tmp_path, None)
        names = [f.path.name for f in files]
        assert "MEMORY.md" in names

    def test_discovers_memory_subdir(self, tmp_path):
        """Files under memory/ subdirectory are discovered."""
        mem = tmp_path / "memory"
        mem.mkdir()
        (mem / "2026-04-01.md").write_text("April log", encoding="utf-8")
        files = discover_files(tmp_path, None)
        names = [f.path.name for f in files]
        assert "2026-04-01.md" in names

    def test_empty_dir_returns_nothing(self, tmp_path):
        """An empty workspace yields no files."""
        assert discover_files(tmp_path, None) == []

    def test_nonexistent_dir_returns_nothing(self, tmp_path):
        """A missing workspace path returns an empty list."""
        missing = tmp_path / "does-not-exist"
        assert discover_files(missing, None) == []

    def test_category_override_applied(self, tmp_path):
        """All discovered files get the override category."""
        (tmp_path / "MEMORY.md").write_text("# mem", encoding="utf-8")
        mem = tmp_path / "memory"
        mem.mkdir()
        (mem / "2026-04-07.md").write_text("daily", encoding="utf-8")
        files = discover_files(tmp_path, "preferences")
        assert all(f.category == "preferences" for f in files)


# ===========================================================================
# Dry-run — no writes
# ===========================================================================


class TestDryRun:
    """--dry-run previews the import and never starts the migration."""

    def test_dry_run_no_writes(self, tmp_path, monkeypatch):
        """--dry-run flag must not call _migrate_async (zero writes)."""
        import migrate

        (tmp_path / "MEMORY.md").write_text("# hello", encoding="utf-8")

        # Patch sys.argv so _parse_args() picks up --dry-run and the tmp dir
        monkeypatch.setattr(
            sys,
            "argv",
            ["migrate.py", "--openclaw-dir", str(tmp_path), "--dry-run"],
        )

        # Patch asyncio.run — if main() calls it, _migrate_async was reached
        with patch("asyncio.run") as mock_run:
            migrate.main()

        mock_run.assert_not_called()

    def test_dry_run_output_lists_files(self, tmp_path, capsys):
        """--dry-run prints each file with its category and char count."""
        from migrate import print_dry_run

        (tmp_path / "MEMORY.md").write_text("curated knowledge", encoding="utf-8")
        mem_dir = tmp_path / "memory"
        mem_dir.mkdir()
        (mem_dir / "2026-04-07.md").write_text("daily log entry", encoding="utf-8")

        files = discover_files(tmp_path, None)
        print_dry_run(files, tmp_path)

        out = capsys.readouterr().out
        assert "DRY RUN" in out
        assert "entities" in out
        assert "events" in out
        assert "0 LLM calls" in out