diff --git a/dataclaw/cli.py b/dataclaw/cli.py index a7957c5..25f7d5f 100644 --- a/dataclaw/cli.py +++ b/dataclaw/cli.py @@ -12,7 +12,7 @@ from .anonymizer import Anonymizer from .config import CONFIG_FILE, DataClawConfig, load_config, save_config -from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions +from .parser import CLAUDE_DIR, CLINE_TASKS_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, KIMI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session HF_TAG = "dataclaw" @@ -51,15 +51,15 @@ SETUP_TO_PUBLISH_STEPS = [ "Step 1/6: Run prep/list to review project scope: dataclaw prep && dataclaw list", - "Step 2/6: Explicitly choose source scope: dataclaw config --source ", + "Step 2/6: Explicitly choose source scope: dataclaw config --source ", "Step 3/6: Configure exclusions/redactions and confirm projects: dataclaw config ...", "Step 4/6: Export locally only: dataclaw export --no-push --output /tmp/dataclaw_export.jsonl", "Step 5/6: Review and confirm: dataclaw confirm ...", "Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"", ] -EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"} -SOURCE_CHOICES = ["auto", "claude", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all"] +EXPLICIT_SOURCE_CHOICES = {"claude", "cline", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all", "both"} +SOURCE_CHOICES = ["auto", "claude", "cline", "codex", "custom", "gemini", "kimi", "opencode", "openclaw", "all"] def _mask_secret(s: str) -> str: @@ -91,9 +91,11 @@ def _source_label(source_filter: str) -> str: return "OpenClaw" if source_filter == "kimi": return "Kimi CLI" + if source_filter == "cline": + return "Cline" if source_filter == "custom": return "Custom" - return "Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom" + return "Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, Cline, or Custom" def _normalize_source_filter(source_filter: str) -> str: @@ -114,7 +116,7 @@ def _resolve_source_choice( Returns: (source_choice, explicit) where source_choice is one of - "claude" | "codex" | "gemini" | "opencode" | "openclaw" | "all" | "auto". + "claude" | "cline" | "codex" | "custom" | "gemini" | "kimi" | "opencode" | "openclaw" | "all" | "auto". """ if _is_explicit_source_choice(requested_source): return requested_source, True @@ -139,9 +141,11 @@ def _has_session_sources(source_filter: str = "auto") -> bool: return OPENCLAW_DIR.exists() if source_filter == "kimi": return KIMI_DIR.exists() + if source_filter == "cline": + return CLINE_TASKS_DIR.exists() if source_filter == "custom": return CUSTOM_DIR.exists() - return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists() + return CLAUDE_DIR.exists() or CLINE_TASKS_DIR.exists() or CODEX_DIR.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or KIMI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists() def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]: @@ -228,13 +232,13 @@ def _build_status_next_steps( if not source_confirmed: steps.append( "Ask the user to explicitly choose export source scope: Claude Code, Codex, Gemini, or all. " - "Then set it: dataclaw config --source . " + "Then set it: dataclaw config --source . " "Do not run export until source scope is explicitly confirmed." ) else: steps.append( f"Source scope is currently set to '{configured_source}'. " - "If the user wants a different scope, run: dataclaw config --source ." + "If the user wants a different scope, run: dataclaw config --source ." ) if not projects_confirmed: steps.append( @@ -478,8 +482,10 @@ def _build_dataset_card(repo_id: str, meta: dict) -> str: tags: - dataclaw - claude-code + - cline - codex-cli - gemini-cli + - kimi-cli - opencode - openclaw - conversations @@ -1119,15 +1125,20 @@ def prep(source_filter: str = "auto") -> None: effective_source_filter = _normalize_source_filter(resolved_source_choice) if not _has_session_sources(effective_source_filter): - if effective_source_filter == "claude": - err = "~/.claude was not found." - elif effective_source_filter == "codex": - err = "~/.codex was not found." - elif effective_source_filter == "gemini": - from .parser import GEMINI_DIR - err = f"{GEMINI_DIR} was not found." + source_paths = { + "claude": "~/.claude", + "cline": "~/.cline/data/tasks", + "codex": "~/.codex", + "gemini": "~/.gemini/tmp", + "kimi": "~/.kimi", + "opencode": "~/.local/share/opencode", + "openclaw": "~/.openclaw", + "custom": "~/.dataclaw/custom", + } + if effective_source_filter in source_paths: + err = f"{source_paths[effective_source_filter]} was not found." else: - err = "None of ~/.claude, ~/.codex, or ~/.gemini/tmp were found." + err = "No supported coding agent data directories were found." print(json.dumps({"error": err})) sys.exit(1) @@ -1341,8 +1352,8 @@ def _run_export(args) -> None: ), "required_action": ( "Ask the user whether to export Claude Code, Codex, Gemini, or all. " - "Then run `dataclaw config --source ` " - "or pass `--source ` on the export command." + "Then run `dataclaw config --source ` " + "or pass `--source ` on the export command." ), "allowed_sources": sorted(EXPLICIT_SOURCE_CHOICES), "blocked_on_step": "Step 2/6", @@ -1423,19 +1434,11 @@ def _run_export(args) -> None: save_config(config) print("=" * 50) - print(" DataClaw — Claude/Codex Log Exporter") + print(" DataClaw — Coding Agent Log Exporter") print("=" * 50) if not _has_session_sources(source_filter): - if source_filter == "claude": - print(f"Error: {CLAUDE_DIR} not found.", file=sys.stderr) - elif source_filter == "codex": - print(f"Error: {CODEX_DIR} not found.", file=sys.stderr) - elif source_filter == "gemini": - from .parser import GEMINI_DIR - print(f"Error: {GEMINI_DIR} not found.", file=sys.stderr) - else: - print("Error: none of ~/.claude, ~/.codex, or ~/.gemini/tmp were found.", file=sys.stderr) + print(f"Error: no {_source_label(source_filter)} data directory found.", file=sys.stderr) sys.exit(1) projects = _filter_projects_by_source(discover_projects(), source_filter) diff --git a/dataclaw/parser.py b/dataclaw/parser.py index 50cc39e..585eda3 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -1,4 +1,4 @@ -"""Parse Claude Code, Codex, Gemini CLI, OpenCode, and OpenClaw session data into conversations.""" +"""Parse Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, and Cline session data into conversations.""" import dataclasses import hashlib @@ -20,6 +20,7 @@ OPENCODE_SOURCE = "opencode" OPENCLAW_SOURCE = "openclaw" KIMI_SOURCE = "kimi" +CLINE_SOURCE = "cline" CUSTOM_SOURCE = "custom" CLAUDE_DIR = Path.home() / ".claude" @@ -45,6 +46,19 @@ KIMI_CONFIG_PATH = KIMI_DIR / "kimi.json" UNKNOWN_KIMI_CWD = "" +CLINE_DIR = Path.home() / ".cline" / "data" +CLINE_TASKS_DIR = CLINE_DIR / "tasks" +CLINE_STATE_DIR = CLINE_DIR / "state" +UNKNOWN_CLINE_CWD = "" +_CLINE_LEGACY_DIRS = [ + Path.home() / "Library" / "Application Support" / "Code" / "User" + / "globalStorage" / "saoudrizwan.claude-dev", + Path.home() / ".config" / "Code" / "User" + / "globalStorage" / "saoudrizwan.claude-dev", + Path.home() / "AppData" / "Roaming" / "Code" / "User" + / "globalStorage" / "saoudrizwan.claude-dev", +] + CUSTOM_DIR = Path.home() / ".dataclaw" / "custom" _CODEX_PROJECT_INDEX: dict[str, list[Path]] = {} @@ -52,6 +66,7 @@ _OPENCODE_PROJECT_INDEX: dict[str, list[str]] = {} _OPENCLAW_PROJECT_INDEX: dict[str, list[Path]] = {} _KIMI_PROJECT_INDEX: dict[str, list[Path]] = {} +_CLINE_PROJECT_INDEX: dict[str, list[Path]] = {} def _build_gemini_hash_map() -> dict[str, str]: @@ -140,6 +155,7 @@ def discover_projects() -> list[dict]: projects.extend(_discover_opencode_projects()) projects.extend(_discover_openclaw_projects()) projects.extend(_discover_kimi_projects()) + projects.extend(_discover_cline_projects()) projects.extend(_discover_custom_projects()) return sorted(projects, key=lambda p: (p["display_name"], p["source"])) @@ -334,6 +350,311 @@ def _build_kimi_project_name(cwd: str) -> str: return f"kimi:{Path(cwd).name or cwd}" +def _load_cline_task_history() -> list[dict]: + """Load Cline task history with fallback chain.""" + # 1. Primary: ~/.cline/data/state/taskHistory.json + history_file = CLINE_STATE_DIR / "taskHistory.json" + if history_file.exists(): + try: + data = json.loads(history_file.read_text()) + if isinstance(data, list): + return data + except (json.JSONDecodeError, OSError): + pass + + # 2. Fallback: ~/.cline/data/globalState.json key "taskHistory" + global_state = CLINE_DIR / "globalState.json" + if global_state.exists(): + try: + data = json.loads(global_state.read_text()) + history = data.get("taskHistory", []) + if isinstance(history, list): + return history + except (json.JSONDecodeError, OSError): + pass + + # 3. Fallback: legacy VS Code globalStorage globalState.json + for legacy_dir in _CLINE_LEGACY_DIRS: + legacy_state = legacy_dir / "globalState.json" + if legacy_state.exists(): + try: + data = json.loads(legacy_state.read_text()) + history = data.get("taskHistory", []) + if isinstance(history, list): + return history + except (json.JSONDecodeError, OSError): + continue + + return [] + + +def _get_cline_task_dirs() -> list[Path]: + """Collect all Cline task dirs from primary + legacy locations, deduplicate by task ID.""" + seen_ids: set[str] = set() + task_dirs: list[Path] = [] + + # Primary location first (preferred) + if CLINE_TASKS_DIR.exists(): + for task_dir in sorted(CLINE_TASKS_DIR.iterdir()): + if task_dir.is_dir() and (task_dir / "api_conversation_history.json").exists(): + seen_ids.add(task_dir.name) + task_dirs.append(task_dir) + + # Legacy locations (only tasks not already found) + for legacy_dir in _CLINE_LEGACY_DIRS: + legacy_tasks = legacy_dir / "tasks" + if not legacy_tasks.exists(): + continue + for task_dir in sorted(legacy_tasks.iterdir()): + if task_dir.is_dir() and task_dir.name not in seen_ids: + if (task_dir / "api_conversation_history.json").exists(): + seen_ids.add(task_dir.name) + task_dirs.append(task_dir) + + return task_dirs + + +def _discover_cline_projects() -> list[dict]: + """Discover Cline projects grouped by working directory.""" + index = _get_cline_project_index(refresh=True) + if not index: + return [] + + projects = [] + for cwd, dirs in sorted(index.items()): + total_size = 0 + for d in dirs: + conv_file = d / "api_conversation_history.json" + try: + total_size += conv_file.stat().st_size + except OSError: + pass + projects.append( + { + "dir_name": cwd, + "display_name": _build_cline_project_name(cwd), + "session_count": len(dirs), + "total_size_bytes": total_size, + "source": CLINE_SOURCE, + } + ) + return projects + + +def _get_cline_project_index(refresh: bool = False) -> dict[str, list[Path]]: + global _CLINE_PROJECT_INDEX + if refresh or not _CLINE_PROJECT_INDEX: + _CLINE_PROJECT_INDEX = _build_cline_project_index() + return _CLINE_PROJECT_INDEX + + +def _build_cline_project_index() -> dict[str, list[Path]]: + """Map CWD -> [task_dir_paths] for Cline tasks.""" + task_dirs = _get_cline_task_dirs() + history = _load_cline_task_history() + id_to_cwd: dict[str, str] = {} + for item in history: + task_id = item.get("id") + cwd = item.get("cwdOnTaskInitialization") + if isinstance(task_id, str) and isinstance(cwd, str) and cwd.strip(): + id_to_cwd[task_id] = cwd + + index: dict[str, list[Path]] = {} + for task_dir in task_dirs: + cwd = id_to_cwd.get(task_dir.name, UNKNOWN_CLINE_CWD) + index.setdefault(cwd, []).append(task_dir) + return index + + +def _build_cline_project_name(cwd: str) -> str: + if cwd == UNKNOWN_CLINE_CWD: + return "cline:unknown" + return f"cline:{Path(cwd).name or cwd}" + + +def _build_cline_tool_result_map(messages: list[dict[str, Any]], anonymizer: Anonymizer) -> dict[str, dict]: + """Build tool_use_id -> {output, status} map from user tool_result blocks in Cline format.""" + result: dict[str, dict] = {} + for msg in messages: + if msg.get("role") != "user": + continue + content = msg.get("content", []) + if not isinstance(content, list): + continue + for block in content: + if not isinstance(block, dict) or block.get("type") != "tool_result": + continue + tid = block.get("tool_use_id") + if not tid: + continue + is_error = bool(block.get("is_error")) + raw_content = block.get("content", "") + if isinstance(raw_content, list): + text = "\n\n".join( + part.get("text", "") for part in raw_content + if isinstance(part, dict) and part.get("type") == "text" + ).strip() + else: + text = str(raw_content).strip() if raw_content else "" + result[tid] = { + "output": {"text": anonymizer.text(text)} if text else {}, + "status": "error" if is_error else "success", + } + return result + + +def _parse_cline_session( + task_dir: Path, + anonymizer: Anonymizer, + include_thinking: bool = True, +) -> dict | None: + """Parse a Cline task directory into structured session data.""" + conv_file = task_dir / "api_conversation_history.json" + if not conv_file.exists(): + return None + + try: + raw = json.loads(conv_file.read_text()) + except (json.JSONDecodeError, OSError): + return None + + if not isinstance(raw, list): + return None + + metadata: dict[str, Any] = { + "session_id": task_dir.name, + "cwd": None, + "git_branch": None, + "model": None, + "start_time": None, + "end_time": None, + } + stats = _make_stats() + tool_result_map = _build_cline_tool_result_map(raw, anonymizer) + + messages: list[dict[str, Any]] = [] + for msg in raw: + role = msg.get("role") + ts = msg.get("ts") + timestamp = None + if isinstance(ts, (int, float)) and ts > 0: + try: + timestamp = datetime.fromtimestamp(ts / 1000, tz=timezone.utc).isoformat() + except (ValueError, OSError, OverflowError): + pass + + if role == "user": + content = msg.get("content") + text_parts = [] + + if isinstance(content, str): + text = content.strip() + if text: + text_parts.append(anonymizer.text(text)) + elif isinstance(content, list): + # Skip messages that are only tool_results (handled via tool_result_map) + has_non_tool_result = False + for block in content: + if not isinstance(block, dict): + continue + block_type = block.get("type") + if block_type == "text": + text = block.get("text", "").strip() + if text: + text_parts.append(anonymizer.text(text)) + has_non_tool_result = True + elif block_type != "tool_result": + has_non_tool_result = True + if not has_non_tool_result and not text_parts: + continue + + if text_parts: + messages.append({ + "role": "user", + "content": "\n\n".join(text_parts), + "timestamp": timestamp, + }) + stats["user_messages"] += 1 + if not metadata["start_time"] and timestamp: + metadata["start_time"] = timestamp + + elif role == "assistant": + text_parts = [] + thinking_parts = [] + tool_uses = [] + + # Extract model info + model_info = msg.get("modelInfo") + if isinstance(model_info, dict) and not metadata["model"]: + model_id = model_info.get("modelId") + if isinstance(model_id, str) and model_id: + metadata["model"] = model_id + + # Extract token stats + metrics = msg.get("metrics") + if isinstance(metrics, dict): + tokens = metrics.get("tokens") + if isinstance(tokens, dict): + prompt = tokens.get("prompt", 0) + completion = tokens.get("completion", 0) + if isinstance(prompt, (int, float)): + stats["input_tokens"] += int(prompt) + if isinstance(completion, (int, float)): + stats["output_tokens"] += int(completion) + + content = msg.get("content") + if isinstance(content, str): + text = content.strip() + if text: + text_parts.append(anonymizer.text(text)) + elif isinstance(content, list): + for block in content: + if not isinstance(block, dict): + continue + block_type = block.get("type") + if block_type == "text": + text = block.get("text", "").strip() + if text: + text_parts.append(anonymizer.text(text)) + elif block_type == "thinking" and include_thinking: + think = block.get("thinking", "").strip() + if think: + thinking_parts.append(anonymizer.text(think)) + elif block_type == "tool_use": + tool_name = block.get("name", "") + tool_input = block.get("input", {}) + tu: dict[str, Any] = { + "tool": tool_name, + "input": _parse_tool_input(tool_name, tool_input, anonymizer), + } + tid = block.get("id") + if tid and tid in tool_result_map: + tr = tool_result_map[tid] + tu["output"] = tr.get("output", {}) + tu["status"] = tr.get("status", "success") + tool_uses.append(tu) + + if not text_parts and not thinking_parts and not tool_uses: + continue + + out_msg: dict[str, Any] = {"role": "assistant"} + if text_parts: + out_msg["content"] = "\n\n".join(text_parts) + if thinking_parts: + out_msg["thinking"] = "\n\n".join(thinking_parts) + if tool_uses: + out_msg["tool_uses"] = tool_uses + stats["tool_uses"] += len(tool_uses) + if timestamp: + out_msg["timestamp"] = timestamp + metadata["end_time"] = timestamp + + messages.append(out_msg) + stats["assistant_messages"] += 1 + + return _make_session_result(metadata, messages, stats) + + def _discover_custom_projects() -> list[dict]: if not CUSTOM_DIR.exists(): return [] @@ -427,6 +748,20 @@ def parse_project_sessions( if source == CUSTOM_SOURCE: return _parse_custom_sessions(project_dir_name, anonymizer) + if source == CLINE_SOURCE: + index = _get_cline_project_index() + task_dirs = index.get(project_dir_name, []) + sessions = [] + for task_dir in task_dirs: + parsed = _parse_cline_session(task_dir, anonymizer, include_thinking) + if parsed and parsed["messages"]: + parsed["project"] = _build_cline_project_name(project_dir_name) + parsed["source"] = CLINE_SOURCE + if not parsed.get("model"): + parsed["model"] = "unknown" + sessions.append(parsed) + return sessions + if source == KIMI_SOURCE: project_hash = _get_kimi_project_hash(project_dir_name) project_path = KIMI_SESSIONS_DIR / project_hash diff --git a/tests/test_cli.py b/tests/test_cli.py index f8212f7..d65086a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -400,7 +400,7 @@ def test_no_projects(self, monkeypatch, capsys): monkeypatch.setattr("dataclaw.cli.discover_projects", lambda: []) list_projects() captured = capsys.readouterr() - assert "No Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, or Custom sessions" in captured.out + assert "No Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, Kimi CLI, Cline, or Custom sessions" in captured.out def test_source_filter_codex(self, monkeypatch, capsys): monkeypatch.setattr( @@ -626,7 +626,7 @@ def test_export_requires_explicit_source_selection(self, monkeypatch, capsys): assert payload["error"] == "Source scope is not confirmed yet." assert payload["blocked_on_step"] == "Step 2/6" assert len(payload["process_steps"]) == 6 - assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "custom", "gemini", "kimi", "openclaw", "opencode"] + assert payload["allowed_sources"] == ["all", "both", "claude", "cline", "codex", "custom", "gemini", "kimi", "openclaw", "opencode"] assert payload["next_command"] == "dataclaw config --source all" def test_configure_next_steps_require_full_folder_presentation(self): diff --git a/tests/test_parser.py b/tests/test_parser.py index e5ffdad..235eefa 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -13,6 +13,7 @@ _extract_user_content, _find_subagent_only_sessions, _normalize_timestamp, + _parse_cline_session, _parse_session_file, _parse_subagent_session, _parse_tool_input, @@ -438,6 +439,8 @@ def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") + monkeypatch.setattr("dataclaw.parser.CLINE_TASKS_DIR", tmp_path / "no-cline-tasks") + monkeypatch.setattr("dataclaw.parser._CLINE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") def _write_opencode_db(self, db_path): @@ -990,6 +993,8 @@ def _disable_codex(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") + monkeypatch.setattr("dataclaw.parser.CLINE_TASKS_DIR", tmp_path / "no-cline-tasks") + monkeypatch.setattr("dataclaw.parser._CLINE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") def test_discover_includes_subagent_sessions(self, tmp_path, monkeypatch, mock_anonymizer): @@ -1590,6 +1595,8 @@ def _disable_others(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") + monkeypatch.setattr("dataclaw.parser.CLINE_TASKS_DIR", tmp_path / "no-cline-tasks") + monkeypatch.setattr("dataclaw.parser._CLINE_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") def test_discover_openclaw_projects(self, tmp_path, monkeypatch, mock_anonymizer): @@ -1677,6 +1684,8 @@ def _disable_others(self, tmp_path, monkeypatch): monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") + monkeypatch.setattr("dataclaw.parser.CLINE_TASKS_DIR", tmp_path / "no-cline-tasks") + monkeypatch.setattr("dataclaw.parser._CLINE_PROJECT_INDEX", {}) def _make_valid_session(self, session_id="s1", model="gpt-4", content="hello"): return json.dumps({ @@ -1778,3 +1787,198 @@ def test_parse_nonexistent_project(self, tmp_path, monkeypatch, mock_anonymizer) monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", custom_dir) sessions = parse_project_sessions("nope", mock_anonymizer, source="custom") assert sessions == [] + + +class TestClineParser: + """Tests for Cline source discovery and parsing.""" + + def _disable_others(self, tmp_path, monkeypatch): + monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude") + monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex-sessions") + monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived") + monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini") + monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db") + monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents") + monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}) + monkeypatch.setattr("dataclaw.parser.KIMI_SESSIONS_DIR", tmp_path / "no-kimi-sessions") + monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom") + + def _setup_cline_dirs(self, tmp_path, monkeypatch): + """Set up Cline directory structure pointing to tmp_path.""" + cline_dir = tmp_path / "cline-data" + tasks_dir = cline_dir / "tasks" + state_dir = cline_dir / "state" + tasks_dir.mkdir(parents=True) + state_dir.mkdir(parents=True) + monkeypatch.setattr("dataclaw.parser.CLINE_DIR", cline_dir) + monkeypatch.setattr("dataclaw.parser.CLINE_TASKS_DIR", tasks_dir) + monkeypatch.setattr("dataclaw.parser.CLINE_STATE_DIR", state_dir) + monkeypatch.setattr("dataclaw.parser._CLINE_LEGACY_DIRS", []) + monkeypatch.setattr("dataclaw.parser._CLINE_PROJECT_INDEX", {}) + return cline_dir, tasks_dir, state_dir + + def _write_task(self, tasks_dir, task_id, messages): + """Write a Cline task with api_conversation_history.json.""" + task_dir = tasks_dir / task_id + task_dir.mkdir(parents=True, exist_ok=True) + (task_dir / "api_conversation_history.json").write_text(json.dumps(messages)) + return task_dir + + def _write_task_history(self, state_dir, items): + """Write taskHistory.json.""" + (state_dir / "taskHistory.json").write_text(json.dumps(items)) + + def test_discover_cline_projects(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cline_dir, tasks_dir, state_dir = self._setup_cline_dirs(tmp_path, monkeypatch) + + # Create two tasks in the same project + self._write_task(tasks_dir, "task-1", [ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ]) + self._write_task(tasks_dir, "task-2", [ + {"role": "user", "content": "bye"}, + {"role": "assistant", "content": "goodbye"}, + ]) + # One task in a different project + self._write_task(tasks_dir, "task-3", [ + {"role": "user", "content": "other"}, + ]) + + self._write_task_history(state_dir, [ + {"id": "task-1", "cwdOnTaskInitialization": "/home/user/project-a"}, + {"id": "task-2", "cwdOnTaskInitialization": "/home/user/project-a"}, + {"id": "task-3", "cwdOnTaskInitialization": "/home/user/project-b"}, + ]) + + projects = discover_projects() + cline_projects = [p for p in projects if p["source"] == "cline"] + assert len(cline_projects) == 2 + names = {p["display_name"] for p in cline_projects} + assert names == {"cline:project-a", "cline:project-b"} + # project-a should have 2 sessions + proj_a = next(p for p in cline_projects if p["display_name"] == "cline:project-a") + assert proj_a["session_count"] == 2 + + def test_parse_cline_session_basic(self, tmp_path, monkeypatch, mock_anonymizer): + cline_dir, tasks_dir, state_dir = self._setup_cline_dirs(tmp_path, monkeypatch) + task_dir = self._write_task(tasks_dir, "task-basic", [ + {"role": "user", "content": "Write a function", "ts": 1700000000000}, + { + "role": "assistant", + "content": "Here is the function", + "ts": 1700000010000, + "modelInfo": {"modelId": "claude-sonnet-4-20250514", "providerId": "anthropic"}, + "metrics": {"tokens": {"prompt": 100, "completion": 50}}, + }, + ]) + + result = _parse_cline_session(task_dir, mock_anonymizer) + assert result is not None + assert result["session_id"] == "task-basic" + assert result["model"] == "claude-sonnet-4-20250514" + assert len(result["messages"]) == 2 + assert result["messages"][0]["role"] == "user" + assert result["messages"][1]["role"] == "assistant" + assert result["stats"]["user_messages"] == 1 + assert result["stats"]["assistant_messages"] == 1 + assert result["stats"]["input_tokens"] == 100 + assert result["stats"]["output_tokens"] == 50 + + def test_parse_cline_session_with_tools(self, tmp_path, monkeypatch, mock_anonymizer): + cline_dir, tasks_dir, state_dir = self._setup_cline_dirs(tmp_path, monkeypatch) + task_dir = self._write_task(tasks_dir, "task-tools", [ + {"role": "user", "content": "Read the file"}, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Let me read that file."}, + {"type": "tool_use", "id": "tu-1", "name": "read_file", "input": {"path": "/tmp/test.py"}}, + ], + }, + { + "role": "user", + "content": [ + {"type": "tool_result", "tool_use_id": "tu-1", "content": "file contents here"}, + ], + }, + {"role": "assistant", "content": "The file contains test code."}, + ]) + + result = _parse_cline_session(task_dir, mock_anonymizer) + assert result is not None + assert len(result["messages"]) == 3 # user, assistant w/ tool, assistant + tool_msg = result["messages"][1] + assert len(tool_msg["tool_uses"]) == 1 + assert tool_msg["tool_uses"][0]["tool"] == "read_file" + assert tool_msg["tool_uses"][0]["status"] == "success" + assert result["stats"]["tool_uses"] == 1 + + def test_parse_cline_session_with_thinking(self, tmp_path, monkeypatch, mock_anonymizer): + cline_dir, tasks_dir, state_dir = self._setup_cline_dirs(tmp_path, monkeypatch) + task_dir = self._write_task(tasks_dir, "task-think", [ + {"role": "user", "content": "Solve this problem"}, + { + "role": "assistant", + "content": [ + {"type": "thinking", "thinking": "Let me think about this..."}, + {"type": "text", "text": "Here is the solution."}, + ], + }, + ]) + + result = _parse_cline_session(task_dir, mock_anonymizer, include_thinking=True) + assert result is not None + assert result["messages"][1].get("thinking") == "Let me think about this..." + + result_no_think = _parse_cline_session(task_dir, mock_anonymizer, include_thinking=False) + assert result_no_think is not None + assert "thinking" not in result_no_think["messages"][1] + + def test_parse_cline_session_no_file(self, tmp_path, mock_anonymizer): + task_dir = tmp_path / "nonexistent-task" + task_dir.mkdir() + result = _parse_cline_session(task_dir, mock_anonymizer) + assert result is None + + def test_cline_project_index_grouping(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cline_dir, tasks_dir, state_dir = self._setup_cline_dirs(tmp_path, monkeypatch) + + self._write_task(tasks_dir, "t1", [{"role": "user", "content": "a"}]) + self._write_task(tasks_dir, "t2", [{"role": "user", "content": "b"}]) + self._write_task(tasks_dir, "t3", [{"role": "user", "content": "c"}]) + + self._write_task_history(state_dir, [ + {"id": "t1", "cwdOnTaskInitialization": "/proj/alpha"}, + {"id": "t2", "cwdOnTaskInitialization": "/proj/alpha"}, + {"id": "t3", "cwdOnTaskInitialization": "/proj/beta"}, + ]) + + sessions_alpha = parse_project_sessions("/proj/alpha", mock_anonymizer, source="cline") + sessions_beta = parse_project_sessions("/proj/beta", mock_anonymizer, source="cline") + assert len(sessions_alpha) == 2 + assert len(sessions_beta) == 1 + + def test_cline_unknown_cwd_fallback(self, tmp_path, monkeypatch, mock_anonymizer): + self._disable_others(tmp_path, monkeypatch) + cline_dir, tasks_dir, state_dir = self._setup_cline_dirs(tmp_path, monkeypatch) + + # Tasks exist but no taskHistory.json + self._write_task(tasks_dir, "orphan-1", [ + {"role": "user", "content": "hello"}, + {"role": "assistant", "content": "hi"}, + ]) + self._write_task(tasks_dir, "orphan-2", [ + {"role": "user", "content": "bye"}, + {"role": "assistant", "content": "goodbye"}, + ]) + + projects = discover_projects() + cline_projects = [p for p in projects if p["source"] == "cline"] + assert len(cline_projects) == 1 + assert cline_projects[0]["display_name"] == "cline:unknown" + assert cline_projects[0]["session_count"] == 2