diff --git a/README.md b/README.md index 4d6bdaf..9316ef7 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ > **This is a performance art project.** Anthropic built their models on the world's freely shared information, then introduced increasingly [dystopian data policies](https://www.anthropic.com/news/detecting-and-preventing-distillation-attacks) to stop anyone else from doing the same with their data — pulling up the ladder behind them. DataClaw lets you throw the ladder back down. The dataset it produces is yours to share. -Turn your Claude Code, Codex, and Gemini CLI conversation history into structured data and publish it to Hugging Face with a single command. DataClaw parses session logs, redacts secrets and PII, and uploads the result as a ready-to-use dataset. +Turn your Claude Code, Codex, Gemini CLI, and OpenCode conversation history into structured data and publish it to Hugging Face with a single command. DataClaw parses session logs, redacts secrets and PII, and uploads the result as a ready-to-use dataset. ![DataClaw](dataclaw.jpeg) @@ -10,7 +10,7 @@ Every export is tagged **`dataclaw`** on Hugging Face. Together, they may someda ## Give this to your agent -Paste this into Claude Code, Codex, or any coding agent: +Paste this into Claude Code, Codex, Gemini CLI, OpenCode, or any coding agent: ``` Help me export my Claude Code, Codex, Gemini CLI, and OpenCode conversation history to Hugging Face using DataClaw. @@ -143,8 +143,7 @@ dataclaw export --publish-attestation "User explicitly approved publishing to Hu | User messages | Yes | Full text (including voice transcripts) | | Assistant responses | Yes | Full text output | | Extended thinking | Yes | Claude's reasoning (opt out with `--no-thinking`) | -| Tool calls | Yes | Tool name + summarized input | -| Tool results | No | Not stored in session logs | +| Tool calls | Yes | Tool name + inputs + outputs | | Token usage | Yes | Input/output tokens per session | | Model & metadata | Yes | Model name, git branch, timestamps | @@ -158,7 +157,7 @@ DataClaw applies multiple layers of protection: 4. **Entropy analysis** — Long high-entropy strings in quotes are flagged as potential secrets 5. **Email redaction** — Personal email addresses removed 6. **Custom redaction** — You can configure additional strings and usernames to redact -7. **Tool input pre-redaction** — Secrets in tool inputs are redacted BEFORE truncation to prevent partial leaks +7. **Tool call redaction** — Secrets in tool inputs and outputs are redacted **This is NOT foolproof.** Always review your exported data before publishing. Automated redaction cannot catch everything — especially service-specific @@ -187,7 +186,14 @@ Each line in `conversations.jsonl` is one session: "role": "assistant", "content": "I'll investigate the login flow.", "thinking": "The user wants me to look at...", - "tool_uses": [{"tool": "Read", "input": "src/auth.py"}], + "tool_uses": [ + { + "tool": "bash", + "input": {"command": "grep -r 'login' src/"}, + "output": {"text": "src/auth.py:42: def login(user, password):"}, + "status": "success" + } + ], "timestamp": "..." } ], @@ -221,7 +227,7 @@ All repos are named `{username}/my-personal-codex-data` and tagged `dataclaw`. ``` The auto-generated HF README includes: -- Model distribution (which Claude models, how many sessions each) +- Model distribution (which models, how many sessions each) - Total token counts - Project count - Last updated timestamp diff --git a/dataclaw/cli.py b/dataclaw/cli.py index e9adffc..35ee819 100644 --- a/dataclaw/cli.py +++ b/dataclaw/cli.py @@ -468,6 +468,7 @@ def _build_dataset_card(repo_id: str, meta: dict) -> str: - claude-code - codex-cli - gemini-cli + - opencode - conversations - coding-assistant - tool-use @@ -481,7 +482,7 @@ def _build_dataset_card(repo_id: str, meta: dict) -> str: # Coding Agent Conversation Logs -> **This is a performance art project.** Anthropic built their models on the world's freely shared information, then introduced increasingly [dystopian data policies](https://www.anthropic.com/news/detecting-and-preventing-distillation-attacks) to stop anyone else from doing the same — pulling up the ladder behind them. DataClaw lets you throw the ladder back down. The dataset it produces is yours to share. +> **This is a performance art project.** Anthropic built their models on the world's freely shared information, then introduced increasingly [dystopian data policies](https://www.anthropic.com/news/detecting-and-preventing-distillation-attacks) to stop anyone else from doing the same with their data — pulling up the ladder behind them. DataClaw lets you throw the ladder back down. The dataset it produces is yours to share. Exported with [DataClaw]({REPO_URL}). @@ -521,7 +522,14 @@ def _build_dataset_card(repo_id: str, meta: dict) -> str: "role": "assistant", "content": "I'll investigate the login flow.", "thinking": "The user wants me to...", - "tool_uses": [{{"tool": "Read", "input": "src/auth.py"}}], + "tool_uses": [ + {{ + "tool": "bash", + "input": {{"command": "grep -r 'login' src/"}}, + "output": {{"text": "src/auth.py:42: def login(user, password):"}}, + "status": "success" + }} + ], "timestamp": "..." }} ], @@ -538,7 +546,6 @@ def _build_dataset_card(repo_id: str, meta: dict) -> str: ### Privacy - Paths anonymized to project-relative; usernames hashed -- No tool outputs — only tool call inputs (summaries) ## Load diff --git a/dataclaw/parser.py b/dataclaw/parser.py index 12f3889..2108537 100644 --- a/dataclaw/parser.py +++ b/dataclaw/parser.py @@ -414,6 +414,34 @@ def _make_session_result( } +def _build_tool_result_map(entries: list[dict[str, Any]], anonymizer: Anonymizer) -> dict[str, dict]: + """Pre-pass: build a map of tool_use_id -> {output, status} from tool_result blocks.""" + result: dict[str, dict] = {} + for entry in entries: + if entry.get("type") != "user": + continue + for block in entry.get("message", {}).get("content", []): + if not isinstance(block, dict) or block.get("type") != "tool_result": + continue + tid = block.get("tool_use_id") + if not tid: + continue + is_error = bool(block.get("is_error")) + content = block.get("content", "") + if isinstance(content, list): + text = "\n\n".join( + part.get("text", "") for part in content + if isinstance(part, dict) and part.get("type") == "text" + ).strip() + else: + text = str(content).strip() if content else "" + result[tid] = { + "output": {"text": anonymizer.text(text)} if text else {}, + "status": "error" if is_error else "success", + } + return result + + def _parse_claude_session_file( filepath: Path, anonymizer: Anonymizer, include_thinking: bool = True ) -> dict | None: @@ -430,11 +458,14 @@ def _parse_claude_session_file( stats = _make_stats() try: - for entry in _iter_jsonl(filepath): - _process_entry(entry, messages, metadata, stats, anonymizer, include_thinking) + entries = list(_iter_jsonl(filepath)) except OSError: return None + tool_result_map = _build_tool_result_map(entries, anonymizer) + for entry in entries: + _process_entry(entry, messages, metadata, stats, anonymizer, include_thinking, tool_result_map) + return _make_session_result(metadata, messages, stats) @@ -501,12 +532,132 @@ def _parse_subagent_session( } stats = _make_stats() - for _ts, entry in timed_entries: - _process_entry(entry, messages, metadata, stats, anonymizer, include_thinking) + entries = [entry for _ts, entry in timed_entries] + tool_result_map = _build_tool_result_map(entries, anonymizer) + for entry in entries: + _process_entry(entry, messages, metadata, stats, anonymizer, include_thinking, tool_result_map) return _make_session_result(metadata, messages, stats) +def _parse_gemini_tool_call(tc: dict, anonymizer: Anonymizer) -> dict: + """Parse a Gemini tool call into a structured dict with input/output/status.""" + name = tc.get("name") + args = tc.get("args", {}) + status = tc.get("status", "unknown") + result_list = tc.get("result") or [] + + # --- Extract output text from functionResponse --- + output_text: str | None = None + extra_texts: list[str] = [] + for item in result_list: + if not isinstance(item, dict): + continue + if "functionResponse" in item: + resp = item["functionResponse"].get("response", {}) + output_text = resp.get("output") + elif "text" in item: + extra_texts.append(item["text"]) + + # --- Build structured input --- + if name == "read_file": + inp = {"file_path": anonymizer.path(args.get("file_path", ""))} + elif name == "write_file": + inp = { + "file_path": anonymizer.path(args.get("file_path", "")), + "content": anonymizer.text(args.get("content", "")), + } + elif name == "replace": + inp = { + "file_path": anonymizer.path(args.get("file_path", "")), + "old_string": anonymizer.text(args.get("old_string", "")), + "new_string": anonymizer.text(args.get("new_string", "")), + "expected_replacements": args.get("expected_replacements"), + "instruction": anonymizer.text(args.get("instruction", "")) if args.get("instruction") else None, + } + inp = {k: v for k, v in inp.items() if v is not None} + elif name == "run_shell_command": + inp = {"command": anonymizer.text(args.get("command", ""))} + elif name == "read_many_files": + inp = {"paths": [anonymizer.path(p) for p in args.get("paths", [])]} + elif name in ("search_file_content", "grep_search"): + inp = {k: anonymizer.text(str(v)) for k, v in args.items()} + elif name == "list_directory": + inp = {"dir_path": anonymizer.path(args.get("dir_path", ""))} + if args.get("ignore"): + inp["ignore"] = [anonymizer.text(str(p)) for p in args["ignore"]] if isinstance(args["ignore"], list) else anonymizer.text(str(args["ignore"])) + elif name == "glob": + inp = {"pattern": args.get("pattern", "")} + elif name in ("google_web_search", "web_fetch", "codebase_investigator"): + inp = {k: anonymizer.text(str(v)) for k, v in args.items()} + else: + inp = {k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in args.items()} + + # --- Build structured output --- + if name == "read_many_files": + # Parse "--- /path/to/file ---\n" blocks from extra text parts + files: list[dict] = [] + for raw in extra_texts: + lines = raw.split("\n") + current_path: str | None = None + content_lines: list[str] = [] + for line in lines: + if line.startswith("--- ") and line.endswith(" ---"): + if current_path is not None: + files.append({ + "path": anonymizer.path(current_path), + "content": anonymizer.text("\n".join(content_lines).strip()), + }) + current_path = line[4:-4].strip() + content_lines = [] + else: + content_lines.append(line) + if current_path is not None: + files.append({ + "path": anonymizer.path(current_path), + "content": anonymizer.text("\n".join(content_lines).strip()), + }) + out: dict = {"files": files} + elif name == "run_shell_command" and output_text: + # Parse "Command: ...\nDirectory: ...\nOutput: ...\nExit Code: ..." format + parsed: dict = {} + current_key: str | None = None + current_val: list[str] = [] + for line in output_text.splitlines(): + for key, prefix in (("command", "Command: "), ("directory", "Directory: "), + ("output", "Output: "), ("exit_code", "Exit Code: ")): + if line.startswith(prefix): + if current_key: + parsed[current_key] = "\n".join(current_val).strip() + current_key = key + current_val = [line[len(prefix):]] + break + else: + if current_key: + current_val.append(line) + if current_key: + parsed[current_key] = "\n".join(current_val).strip() + if "exit_code" in parsed: + try: + parsed["exit_code"] = int(parsed["exit_code"]) + except ValueError: + pass + if "command" in parsed: + parsed["command"] = anonymizer.text(parsed["command"]) + if "directory" in parsed: + parsed["directory"] = anonymizer.path(parsed["directory"]) + if "output" in parsed: + parsed["output"] = anonymizer.text(parsed["output"]) + out = parsed + elif output_text is not None: + out = {"text": anonymizer.text(output_text)} + else: + out = {} + + result: dict = {"tool": name, "input": inp, "output": out, "status": status} + return result + + def _parse_gemini_session_file( filepath: Path, anonymizer: Anonymizer, include_thinking: bool = True ) -> dict | None: @@ -579,12 +730,7 @@ def _parse_gemini_session_file( tool_uses = [] for tc in msg_data.get("toolCalls", []): - tool_name = tc.get("name") - args_data = tc.get("args", {}) - tool_uses.append({ - "tool": tool_name, - "input": _summarize_tool_input(tool_name, args_data, anonymizer) - }) + tool_uses.append(_parse_gemini_tool_call(tc, anonymizer)) if tool_uses: msg["tool_uses"] = tool_uses @@ -609,6 +755,63 @@ class _CodexParseState: raw_cwd: str = UNKNOWN_CODEX_CWD max_input_tokens: int = 0 max_output_tokens: int = 0 + tool_result_map: dict[str, dict] = dataclasses.field(default_factory=dict) + + +def _build_codex_tool_result_map(entries: list[dict[str, Any]], anonymizer: Anonymizer) -> dict[str, dict]: + """Pre-pass: build call_id -> {output, status} from function_call_output and custom_tool_call_output.""" + result: dict[str, dict] = {} + for entry in entries: + if entry.get("type") != "response_item": + continue + p = entry.get("payload", {}) + pt = p.get("type") + call_id = p.get("call_id") + if not call_id: + continue + + if pt == "function_call_output": + raw = p.get("output", "") + # Parse "Exit code: N\nWall time: ...\nOutput:\n..." format + out: dict = {} + lines = raw.splitlines() + output_lines: list[str] = [] + in_output = False + for line in lines: + if line.startswith("Exit code: "): + try: + out["exit_code"] = int(line[len("Exit code: "):].strip()) + except ValueError: + out["exit_code"] = line[len("Exit code: "):].strip() + elif line.startswith("Wall time: "): + out["wall_time"] = line[len("Wall time: "):].strip() + elif line == "Output:": + in_output = True + elif in_output: + output_lines.append(line) + if output_lines: + out["output"] = anonymizer.text("\n".join(output_lines).strip()) + result[call_id] = {"output": out, "status": "success"} + + elif pt == "custom_tool_call_output": + raw = p.get("output", "") + out = {} + try: + parsed = json.loads(raw) + text = parsed.get("output", "") + if text: + out["output"] = anonymizer.text(str(text)) + meta = parsed.get("metadata", {}) + if "exit_code" in meta: + out["exit_code"] = meta["exit_code"] + if "duration_seconds" in meta: + out["duration_seconds"] = meta["duration_seconds"] + except (json.JSONDecodeError, AttributeError): + if raw: + out["output"] = anonymizer.text(raw) + result[call_id] = {"output": out, "status": "success"} + + return result def _parse_codex_session_file( @@ -630,35 +833,39 @@ def _parse_codex_session_file( ) try: - for entry in _iter_jsonl(filepath): - timestamp = _normalize_timestamp(entry.get("timestamp")) - entry_type = entry.get("type") - - if entry_type == "session_meta": - _handle_codex_session_meta(state, entry, filepath, anonymizer) - elif entry_type == "turn_context": - _handle_codex_turn_context(state, entry, anonymizer) - elif entry_type == "response_item": - _handle_codex_response_item(state, entry, anonymizer, include_thinking) - elif entry_type == "event_msg": - payload = entry.get("payload", {}) - event_type = payload.get("type") - if event_type == "token_count": - _handle_codex_token_count(state, payload) - elif event_type == "agent_reasoning" and include_thinking: - thinking = payload.get("text") - if isinstance(thinking, str) and thinking.strip(): - cleaned = anonymizer.text(thinking.strip()) - if cleaned not in state._pending_thinking_seen: - state._pending_thinking_seen.add(cleaned) - state.pending_thinking.append(cleaned) - elif event_type == "user_message": - _handle_codex_user_message(state, payload, timestamp, anonymizer) - elif event_type == "agent_message": - _handle_codex_agent_message(state, payload, timestamp, anonymizer, include_thinking) + entries = list(_iter_jsonl(filepath)) except OSError: return None + state.tool_result_map = _build_codex_tool_result_map(entries, anonymizer) + + for entry in entries: + timestamp = _normalize_timestamp(entry.get("timestamp")) + entry_type = entry.get("type") + + if entry_type == "session_meta": + _handle_codex_session_meta(state, entry, filepath, anonymizer) + elif entry_type == "turn_context": + _handle_codex_turn_context(state, entry, anonymizer) + elif entry_type == "response_item": + _handle_codex_response_item(state, entry, anonymizer, include_thinking) + elif entry_type == "event_msg": + payload = entry.get("payload", {}) + event_type = payload.get("type") + if event_type == "token_count": + _handle_codex_token_count(state, payload) + elif event_type == "agent_reasoning" and include_thinking: + thinking = payload.get("text") + if isinstance(thinking, str) and thinking.strip(): + cleaned = anonymizer.text(thinking.strip()) + if cleaned not in state._pending_thinking_seen: + state._pending_thinking_seen.add(cleaned) + state.pending_thinking.append(cleaned) + elif event_type == "user_message": + _handle_codex_user_message(state, payload, timestamp, anonymizer) + elif event_type == "agent_message": + _handle_codex_agent_message(state, payload, timestamp, anonymizer, include_thinking) + state.stats["input_tokens"] = state.max_input_tokens state.stats["output_tokens"] = state.max_output_tokens @@ -723,7 +930,19 @@ def _handle_codex_response_item( state.pending_tool_uses.append( { "tool": tool_name, - "input": _summarize_tool_input(tool_name, args_data, anonymizer), + "input": _parse_tool_input(tool_name, args_data, anonymizer), + "_call_id": payload.get("call_id"), + } + ) + elif item_type == "custom_tool_call": + tool_name = payload.get("name") + raw_input = payload.get("input", "") + inp = {"patch": anonymizer.text(raw_input)} if isinstance(raw_input, str) else _parse_tool_input(tool_name, raw_input, anonymizer) + state.pending_tool_uses.append( + { + "tool": tool_name, + "input": inp, + "_call_id": payload.get("call_id"), } ) elif item_type == "reasoning" and include_thinking: @@ -768,6 +987,19 @@ def _handle_codex_user_message( _update_time_bounds(state.metadata, timestamp) +def _resolve_codex_tool_uses(state: _CodexParseState) -> list[dict]: + """Attach outputs from tool_result_map and strip internal _call_id field.""" + resolved = [] + for tu in state.pending_tool_uses: + call_id = tu.pop("_call_id", None) + if call_id and call_id in state.tool_result_map: + r = state.tool_result_map[call_id] + tu["output"] = r["output"] + tu["status"] = r["status"] + resolved.append(tu) + return resolved + + def _handle_codex_agent_message( state: _CodexParseState, payload: dict[str, Any], timestamp: str | None, anonymizer: Anonymizer, include_thinking: bool, @@ -779,7 +1011,7 @@ def _handle_codex_agent_message( if state.pending_thinking and include_thinking: msg["thinking"] = "\n\n".join(state.pending_thinking) if state.pending_tool_uses: - msg["tool_uses"] = list(state.pending_tool_uses) + msg["tool_uses"] = _resolve_codex_tool_uses(state) if len(msg) > 1: msg["timestamp"] = timestamp @@ -801,7 +1033,7 @@ def _flush_codex_pending(state: _CodexParseState, timestamp: str | None) -> None if state.pending_thinking: msg["thinking"] = "\n\n".join(state.pending_thinking) if state.pending_tool_uses: - msg["tool_uses"] = list(state.pending_tool_uses) + msg["tool_uses"] = _resolve_codex_tool_uses(state) state.messages.append(msg) state.stats["assistant_messages"] += 1 @@ -905,12 +1137,20 @@ def _extract_opencode_assistant_content( tool_name = part.get("tool") state = part.get("state", {}) tool_input = state.get("input", {}) if isinstance(state, dict) else {} - tool_uses.append( - { - "tool": tool_name, - "input": _summarize_tool_input(tool_name, tool_input, anonymizer), - } - ) + tu: dict[str, Any] = { + "tool": tool_name, + "input": _parse_tool_input(tool_name, tool_input, anonymizer), + } + if isinstance(state, dict): + status = state.get("status") + if isinstance(status, str): + tu["status"] = "success" if status == "completed" else status + output = state.get("output") + if isinstance(output, str) and output: + tu["output"] = {"text": anonymizer.text(output)} + elif output is not None: + tu["output"] = {} + tool_uses.append(tu) if not text_parts and not thinking_parts and not tool_uses: return None @@ -1008,6 +1248,7 @@ def _process_entry( stats: dict[str, int], anonymizer: Anonymizer, include_thinking: bool, + tool_result_map: dict[str, dict] | None = None, ) -> None: entry_type = entry.get("type") @@ -1027,7 +1268,7 @@ def _process_entry( _update_time_bounds(metadata, timestamp) elif entry_type == "assistant": - msg = _extract_assistant_content(entry, anonymizer, include_thinking) + msg = _extract_assistant_content(entry, anonymizer, include_thinking, tool_result_map) if msg: if metadata["model"] is None: metadata["model"] = entry.get("message", {}).get("model") @@ -1054,6 +1295,7 @@ def _extract_user_content(entry: dict[str, Any], anonymizer: Anonymizer) -> str def _extract_assistant_content( entry: dict[str, Any], anonymizer: Anonymizer, include_thinking: bool, + tool_result_map: dict[str, dict] | None = None, ) -> dict[str, Any] | None: msg_data = entry.get("message", {}) content_blocks = msg_data.get("content", []) @@ -1077,10 +1319,16 @@ def _extract_assistant_content( if thinking: thinking_parts.append(anonymizer.text(thinking)) elif block_type == "tool_use": - tool_uses.append({ + tu: dict[str, Any] = { "tool": block.get("name"), - "input": _summarize_tool_input(block.get("name"), block.get("input", {}), anonymizer), - }) + "input": _parse_tool_input(block.get("name"), block.get("input", {}), anonymizer), + } + if tool_result_map is not None: + result = tool_result_map.get(block.get("id", "")) + if result: + tu["output"] = result["output"] + tu["status"] = result["status"] + tool_uses.append(tu) if not text_parts and not tool_uses and not thinking_parts: return None @@ -1095,60 +1343,66 @@ def _extract_assistant_content( return msg -MAX_TOOL_INPUT_LENGTH = 300 - - -def _redact_and_truncate(text: str, anonymizer: Anonymizer) -> str: - """Redact secrets BEFORE truncating to avoid partial secret leaks.""" - text, _ = redact_text(text) - return anonymizer.text(text[:MAX_TOOL_INPUT_LENGTH]) - - -def _summarize_file_path(d: dict, a: Anonymizer) -> str: - return a.path(d.get("file_path", "")) - - -def _summarize_write(d: dict, a: Anonymizer) -> str: - return f"{a.path(d.get('file_path', ''))} ({len(d.get('content', ''))} chars)" - - -def _summarize_bash(d: dict, a: Anonymizer) -> str: - return _redact_and_truncate(d.get("command", ""), a) - - -def _summarize_grep(d: dict, a: Anonymizer) -> str: - pattern, _ = redact_text(d.get("pattern", "")) - return f"pattern={a.text(pattern)} path={a.path(d.get('path', ''))}" - - -def _summarize_glob(d: dict, a: Anonymizer) -> str: - return f"pattern={a.text(d.get('pattern', ''))} path={a.path(d.get('path', ''))}" - - -_TOOL_SUMMARIZERS: dict[str, Any] = { - "read": _summarize_file_path, - "edit": _summarize_file_path, - "write": _summarize_write, - "bash": _summarize_bash, - "grep": _summarize_grep, - "glob": _summarize_glob, - "task": lambda d, a: _redact_and_truncate(d.get("prompt", ""), a), - "websearch": lambda d, _: d.get("query", ""), - "webfetch": lambda d, _: d.get("url", ""), -} - - -def _summarize_tool_input(tool_name: str | None, input_data: Any, anonymizer: Anonymizer) -> str: - """Summarize tool input for export.""" +def _parse_tool_input(tool_name: str | None, input_data: Any, anonymizer: Anonymizer) -> dict: + """Return a structured dict for a tool's input args, with paths/content anonymized.""" if not isinstance(input_data, dict): - return _redact_and_truncate(str(input_data), anonymizer) - - name = tool_name.lower() if tool_name else "" - summarizer = _TOOL_SUMMARIZERS.get(name) - if summarizer is not None: - return summarizer(input_data, anonymizer) - return _redact_and_truncate(str(input_data), anonymizer) - + return {"raw": anonymizer.text(str(input_data))} + + name = (tool_name or "").lower() + + # Claude Code tools + if name in ("read", "edit"): + return {"file_path": anonymizer.path(input_data.get("file_path", ""))} + if name == "write": + return { + "file_path": anonymizer.path(input_data.get("file_path", "")), + "content": anonymizer.text(input_data.get("content", "")), + } + if name == "bash": + cmd, _ = redact_text(input_data.get("command", "")) + return {"command": anonymizer.text(cmd)} + if name == "grep": + pattern, _ = redact_text(input_data.get("pattern", "")) + return {"pattern": anonymizer.text(pattern), "path": anonymizer.path(input_data.get("path", ""))} + if name == "glob": + return {"pattern": input_data.get("pattern", ""), "path": anonymizer.path(input_data.get("path", ""))} + if name == "task": + return {"prompt": anonymizer.text(input_data.get("prompt", ""))} + if name == "websearch": + return {"query": anonymizer.text(input_data.get("query", ""))} + if name == "webfetch": + return {"url": anonymizer.text(input_data.get("url", ""))} + if name == "apply_patch": + return {"patch": anonymizer.text(input_data.get("patchText", ""))} + if name == "codesearch": + return {"query": anonymizer.text(input_data.get("query", ""))} + + # Codex tools + if name == "exec_command": + cmd, _ = redact_text(input_data.get("cmd", "")) + return {"cmd": anonymizer.text(cmd)} + if name == "shell_command": + cmd, _ = redact_text(input_data.get("command", "")) + return { + "command": anonymizer.text(cmd), + "workdir": anonymizer.path(input_data.get("workdir", "")), + } + if name == "write_stdin": + return { + "session_id": input_data.get("session_id"), + "chars": anonymizer.text(input_data.get("chars", "")), + "yield_time_ms": input_data.get("yield_time_ms"), + "max_output_tokens": input_data.get("max_output_tokens"), + } + if name == "update_plan": + plan = input_data.get("plan", []) + return { + "explanation": anonymizer.text(input_data.get("explanation", "")), + "plan": [anonymizer.text(str(p)) if isinstance(p, str) else p for p in plan], + } + + # Fallback: anonymize all string values + return {k: anonymizer.text(str(v)) if isinstance(v, str) else v for k, v in input_data.items()} def _normalize_timestamp(value) -> str | None: if value is None: diff --git a/dataclaw/secrets.py b/dataclaw/secrets.py index 5624687..00ace97 100644 --- a/dataclaw/secrets.py +++ b/dataclaw/secrets.py @@ -2,6 +2,7 @@ import math import re +from typing import Any REDACTED = "[REDACTED]" @@ -225,6 +226,32 @@ def redact_custom_strings(text: str, strings: list[str]) -> tuple[str, int]: return text, count +def _redact_value(value: Any, custom_strings: list[str] | None = None) -> tuple[Any, int]: + """Recursively redact secrets from a string, list, or dict value.""" + if isinstance(value, str): + result, count = redact_text(value) + if custom_strings: + result, n = redact_custom_strings(result, custom_strings) + count += n + return result, count + if isinstance(value, dict): + total = 0 + out = {} + for k, v in value.items(): + out[k], n = _redact_value(v, custom_strings) + total += n + return out, total + if isinstance(value, list): + total = 0 + out_list = [] + for item in value: + redacted, n = _redact_value(item, custom_strings) + out_list.append(redacted) + total += n + return out_list, total + return value, 0 + + def redact_session(session: dict, custom_strings: list[str] | None = None) -> tuple[dict, int]: """Redact all secrets in a session dict. Returns (redacted_session, total_redactions).""" total = 0 @@ -238,11 +265,9 @@ def redact_session(session: dict, custom_strings: list[str] | None = None) -> tu msg[field], count = redact_custom_strings(msg[field], custom_strings) total += count for tool_use in msg.get("tool_uses", []): - if tool_use.get("input"): - tool_use["input"], count = redact_text(tool_use["input"]) - total += count - if custom_strings: - tool_use["input"], count = redact_custom_strings(tool_use["input"], custom_strings) + for field in ("input", "output"): + if tool_use.get(field): + tool_use[field], count = _redact_value(tool_use[field], custom_strings) total += count return session, total diff --git a/tests/test_parser.py b/tests/test_parser.py index 0f0e506..a128751 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -7,14 +7,16 @@ from dataclaw.parser import ( _build_project_name, + _build_tool_result_map, + _build_codex_tool_result_map, _extract_assistant_content, _extract_user_content, _find_subagent_only_sessions, _normalize_timestamp, _parse_session_file, _parse_subagent_session, + _parse_tool_input, _process_entry, - _summarize_tool_input, discover_projects, parse_project_sessions, _parse_codex_session_file, @@ -101,75 +103,107 @@ def test_other_type_returns_none(self): assert _normalize_timestamp({"ts": 123}) is None -# --- _summarize_tool_input --- +# --- _parse_tool_input --- -class TestSummarizeToolInput: +class TestParseToolInput: def test_read_tool(self, mock_anonymizer): - result = _summarize_tool_input("Read", {"file_path": "/tmp/test.py"}, mock_anonymizer) - assert "test.py" in result + result = _parse_tool_input("Read", {"file_path": "/tmp/test.py"}, mock_anonymizer) + assert isinstance(result, dict) + assert "file_path" in result + assert "test.py" in result["file_path"] def test_write_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "Write", {"file_path": "/tmp/test.py", "content": "abc"}, mock_anonymizer, ) - assert "test.py" in result - assert "3 chars" in result + assert isinstance(result, dict) + assert "file_path" in result + assert "content" in result def test_bash_tool(self, mock_anonymizer): - result = _summarize_tool_input("Bash", {"command": "ls -la"}, mock_anonymizer) - assert "ls -la" in result + result = _parse_tool_input("Bash", {"command": "ls -la"}, mock_anonymizer) + assert isinstance(result, dict) + assert result["command"] == "ls -la" def test_grep_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "Grep", {"pattern": "TODO", "path": "/tmp"}, mock_anonymizer, ) - assert "pattern=" in result - assert "path=" in result + assert isinstance(result, dict) + assert "pattern" in result + assert "path" in result def test_glob_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "Glob", {"pattern": "*.py", "path": "/tmp"}, mock_anonymizer, ) - assert "pattern=" in result + assert isinstance(result, dict) + assert result["pattern"] == "*.py" def test_task_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "Task", {"prompt": "Search for bugs"}, mock_anonymizer, ) - assert "Search for bugs" in result + assert isinstance(result, dict) + assert "Search for bugs" in result["prompt"] def test_websearch_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "WebSearch", {"query": "python async"}, mock_anonymizer, ) - assert "python async" in result + assert isinstance(result, dict) + assert result["query"] == "python async" def test_webfetch_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "WebFetch", {"url": "https://example.com"}, mock_anonymizer, ) - assert "https://example.com" in result - - def test_unknown_tool(self, mock_anonymizer): - result = _summarize_tool_input( - "CustomTool", {"foo": "bar"}, mock_anonymizer, - ) - assert "foo" in result or "bar" in result + assert isinstance(result, dict) + assert result["url"] == "https://example.com" def test_edit_tool(self, mock_anonymizer): - result = _summarize_tool_input( + result = _parse_tool_input( "Edit", {"file_path": "/tmp/test.py"}, mock_anonymizer, ) - assert "test.py" in result + assert isinstance(result, dict) + assert "file_path" in result + + def test_exec_command_tool(self, mock_anonymizer): + result = _parse_tool_input("exec_command", {"cmd": "ls -la"}, mock_anonymizer) + assert isinstance(result, dict) + assert result["cmd"] == "ls -la" + + def test_shell_command_tool(self, mock_anonymizer): + result = _parse_tool_input( + "shell_command", {"command": "ls", "workdir": "/tmp"}, mock_anonymizer, + ) + assert isinstance(result, dict) + assert result["command"] == "ls" + assert "workdir" in result + + def test_update_plan_tool(self, mock_anonymizer): + result = _parse_tool_input( + "update_plan", + {"explanation": "New plan", "plan": [{"step": "do it", "status": "pending"}]}, + mock_anonymizer, + ) + assert isinstance(result, dict) + assert "explanation" in result + assert "plan" in result + + def test_unknown_tool(self, mock_anonymizer): + result = _parse_tool_input("CustomTool", {"foo": "bar"}, mock_anonymizer) + assert isinstance(result, dict) def test_none_tool_name(self, mock_anonymizer): - result = _summarize_tool_input(None, {"data": "value"}, mock_anonymizer) - assert isinstance(result, str) + result = _parse_tool_input(None, {"data": "value"}, mock_anonymizer) + assert isinstance(result, dict) def test_non_dict_input(self, mock_anonymizer): - result = _summarize_tool_input("Read", "just a string", mock_anonymizer) - assert isinstance(result, str) + result = _parse_tool_input("Read", "just a string", mock_anonymizer) + assert isinstance(result, dict) + assert "raw" in result # --- _extract_user_content --- @@ -1031,3 +1065,262 @@ def test_parse_includes_subagent_sessions(self, tmp_path, monkeypatch, mock_anon contents = {s["messages"][0]["content"] for s in sessions} assert "Root msg" in contents assert "SA msg" in contents + + +# --- _build_tool_result_map (Claude tool outputs) --- + + +class TestBuildToolResultMap: + def test_basic_string_output(self, mock_anonymizer): + entries = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tu-1", + "content": "file contents here", + "is_error": False, + } + ] + }, + } + ] + result = _build_tool_result_map(entries, mock_anonymizer) + assert "tu-1" in result + assert result["tu-1"]["status"] == "success" + assert result["tu-1"]["output"]["text"] == "file contents here" + + def test_error_result(self, mock_anonymizer): + entries = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tu-2", + "content": "Permission denied", + "is_error": True, + } + ] + }, + } + ] + result = _build_tool_result_map(entries, mock_anonymizer) + assert result["tu-2"]["status"] == "error" + + def test_list_content(self, mock_anonymizer): + entries = [ + { + "type": "user", + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tu-3", + "content": [ + {"type": "text", "text": "Part one"}, + {"type": "text", "text": "Part two"}, + ], + } + ] + }, + } + ] + result = _build_tool_result_map(entries, mock_anonymizer) + assert "Part one" in result["tu-3"]["output"]["text"] + assert "Part two" in result["tu-3"]["output"]["text"] + + def test_empty_content_gives_empty_output(self, mock_anonymizer): + entries = [ + { + "type": "user", + "message": { + "content": [ + {"type": "tool_result", "tool_use_id": "tu-4", "content": ""} + ] + }, + } + ] + result = _build_tool_result_map(entries, mock_anonymizer) + assert result["tu-4"]["output"] == {} + + def test_non_user_entries_ignored(self, mock_anonymizer): + entries = [ + { + "type": "assistant", + "message": { + "content": [ + {"type": "tool_result", "tool_use_id": "tu-5", "content": "ignored"} + ] + }, + } + ] + result = _build_tool_result_map(entries, mock_anonymizer) + assert "tu-5" not in result + + def test_tool_output_attached_in_session(self, tmp_path, mock_anonymizer): + """End-to-end: tool_use in assistant entry gets output from tool_result in user entry.""" + f = tmp_path / "session.jsonl" + entries = [ + { + "type": "assistant", + "timestamp": 1706000001000, + "message": { + "model": "claude-sonnet", + "content": [ + { + "type": "tool_use", + "id": "tu-abc", + "name": "Bash", + "input": {"command": "ls"}, + } + ], + "usage": {"input_tokens": 10, "output_tokens": 5}, + }, + }, + { + "type": "user", + "timestamp": 1706000002000, + "message": { + "content": [ + { + "type": "tool_result", + "tool_use_id": "tu-abc", + "content": "file1.py\nfile2.py", + "is_error": False, + } + ] + }, + }, + ] + f.write_text("\n".join(json.dumps(e) for e in entries) + "\n") + result = _parse_session_file(f, mock_anonymizer) + assert result is not None + tu = result["messages"][0]["tool_uses"][0] + assert tu["tool"] == "Bash" + assert tu["status"] == "success" + assert "file1.py" in tu["output"]["text"] + + +# --- _build_codex_tool_result_map --- + + +class TestBuildCodexToolResultMap: + def test_function_call_output(self, mock_anonymizer): + entries = [ + { + "type": "response_item", + "payload": { + "type": "function_call_output", + "call_id": "call-1", + "output": "Exit code: 0\nWall time: 1 seconds\nOutput:\nhello world\n", + }, + } + ] + result = _build_codex_tool_result_map(entries, mock_anonymizer) + assert "call-1" in result + assert result["call-1"]["status"] == "success" + assert result["call-1"]["output"]["exit_code"] == 0 + assert result["call-1"]["output"]["wall_time"] == "1 seconds" + assert "hello world" in result["call-1"]["output"]["output"] + + def test_custom_tool_call_output(self, mock_anonymizer): + import json as _json + entries = [ + { + "type": "response_item", + "payload": { + "type": "custom_tool_call_output", + "call_id": "call-2", + "output": _json.dumps({ + "output": "Successfully applied patch", + "metadata": {"exit_code": 0, "duration_seconds": 0.5}, + }), + }, + } + ] + result = _build_codex_tool_result_map(entries, mock_anonymizer) + assert "call-2" in result + assert result["call-2"]["output"]["exit_code"] == 0 + assert "Successfully applied patch" in result["call-2"]["output"]["output"] + assert result["call-2"]["output"]["duration_seconds"] == 0.5 + + def test_non_response_item_ignored(self, mock_anonymizer): + entries = [ + { + "type": "event_msg", + "payload": { + "type": "function_call_output", + "call_id": "call-3", + "output": "ignored", + }, + } + ] + result = _build_codex_tool_result_map(entries, mock_anonymizer) + assert "call-3" not in result + + def test_output_attached_end_to_end(self, tmp_path, monkeypatch, mock_anonymizer): + """Codex tool output is attached to the tool_use in the parsed session.""" + import json as _json + monkeypatch.setattr("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude") + monkeypatch.setattr("dataclaw.parser._CODEX_PROJECT_INDEX", {}) + + codex_sessions = tmp_path / "codex-sessions" / "2026" / "02" / "24" + codex_sessions.mkdir(parents=True) + session_file = codex_sessions / "rollout-1.jsonl" + lines = [ + { + "timestamp": "2026-02-24T16:09:59.567Z", + "type": "session_meta", + "payload": {"id": "s1", "cwd": "/home/user/repo", "model_provider": "openai"}, + }, + { + "timestamp": "2026-02-24T16:10:00.000Z", + "type": "event_msg", + "payload": {"type": "user_message", "message": "run ls"}, + }, + { + "timestamp": "2026-02-24T16:10:00.100Z", + "type": "response_item", + "payload": { + "type": "function_call", + "name": "shell_command", + "call_id": "call-x", + "arguments": _json.dumps({"command": "ls", "workdir": "/home/user/repo"}), + }, + }, + { + "timestamp": "2026-02-24T16:10:00.200Z", + "type": "response_item", + "payload": { + "type": "function_call_output", + "call_id": "call-x", + "output": "Exit code: 0\nWall time: 0 seconds\nOutput:\nfoo.py\nbar.py\n", + }, + }, + { + "timestamp": "2026-02-24T16:10:01.000Z", + "type": "event_msg", + "payload": {"type": "agent_message", "message": "Done."}, + }, + ] + session_file.write_text("\n".join(_json.dumps(l) for l in lines) + "\n") + + monkeypatch.setattr("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "codex-sessions") + monkeypatch.setattr("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "codex-archived") + + result = _parse_codex_session_file( + session_file, mock_anonymizer, include_thinking=True, + target_cwd="/home/user/repo", + ) + assert result is not None + assistant_msgs = [m for m in result["messages"] if m["role"] == "assistant"] + assert len(assistant_msgs) == 1 + tu = assistant_msgs[0]["tool_uses"][0] + assert tu["tool"] == "shell_command" + assert tu["status"] == "success" + assert tu["output"]["exit_code"] == 0 + assert "foo.py" in tu["output"]["output"]