Skip to content

Commit 520b301

Browse files
peteromallet and claude committed
Add custom source for user-provided JSONL, bump to v0.3.1
Users can drop pre-formatted JSONL files into ~/.dataclaw/custom/<project>/ and DataClaw picks them up alongside all other sources. Sessions are validated for required fields and redacted through the standard pipeline. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 401b64f commit 520b301

5 files changed

Lines changed: 219 additions & 8 deletions

File tree

dataclaw/cli.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
from .anonymizer import Anonymizer
1414
from .config import CONFIG_FILE, DataClawConfig, load_config, save_config
15-
from .parser import CLAUDE_DIR, CODEX_DIR, GEMINI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
15+
from .parser import CLAUDE_DIR, CODEX_DIR, CUSTOM_DIR, GEMINI_DIR, OPENCODE_DIR, OPENCLAW_DIR, discover_projects, parse_project_sessions
1616
from .secrets import _has_mixed_char_types, _shannon_entropy, redact_session
1717

1818
HF_TAG = "dataclaw"
@@ -58,8 +58,8 @@
5858
"Step 6/6: After explicit user approval, publish: dataclaw export --publish-attestation \"User explicitly approved publishing to Hugging Face.\"",
5959
]
6060

61-
EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "gemini", "opencode", "openclaw", "all", "both"}
62-
SOURCE_CHOICES = ["auto", "claude", "codex", "gemini", "opencode", "openclaw", "all"]
61+
EXPLICIT_SOURCE_CHOICES = {"claude", "codex", "custom", "gemini", "opencode", "openclaw", "all", "both"}
62+
SOURCE_CHOICES = ["auto", "claude", "codex", "custom", "gemini", "opencode", "openclaw", "all"]
6363

6464

6565
def _mask_secret(s: str) -> str:
@@ -89,7 +89,9 @@ def _source_label(source_filter: str) -> str:
8989
return "OpenCode"
9090
if source_filter == "openclaw":
9191
return "OpenClaw"
92-
return "Claude Code, Codex, Gemini CLI, OpenCode, or OpenClaw"
92+
if source_filter == "custom":
93+
return "Custom"
94+
return "Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, or Custom"
9395

9496

9597
def _normalize_source_filter(source_filter: str) -> str:
@@ -133,7 +135,9 @@ def _has_session_sources(source_filter: str = "auto") -> bool:
133135
return OPENCODE_DIR.exists()
134136
if source_filter == "openclaw":
135137
return OPENCLAW_DIR.exists()
136-
return CLAUDE_DIR.exists() or CODEX_DIR.exists() or GEMINI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists()
138+
if source_filter == "custom":
139+
return CUSTOM_DIR.exists()
140+
return CLAUDE_DIR.exists() or CODEX_DIR.exists() or CUSTOM_DIR.exists() or GEMINI_DIR.exists() or OPENCODE_DIR.exists() or OPENCLAW_DIR.exists()
137141

138142

139143
def _filter_projects_by_source(projects: list[dict], source_filter: str) -> list[dict]:

dataclaw/parser.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
GEMINI_SOURCE = "gemini"
2020
OPENCODE_SOURCE = "opencode"
2121
OPENCLAW_SOURCE = "openclaw"
22+
CUSTOM_SOURCE = "custom"
2223

2324
CLAUDE_DIR = Path.home() / ".claude"
2425
PROJECTS_DIR = CLAUDE_DIR / "projects"
@@ -38,6 +39,8 @@
3839
OPENCLAW_AGENTS_DIR = OPENCLAW_DIR / "agents"
3940
UNKNOWN_OPENCLAW_CWD = "<unknown-cwd>"
4041

42+
CUSTOM_DIR = Path.home() / ".dataclaw" / "custom"
43+
4144
_CODEX_PROJECT_INDEX: dict[str, list[Path]] = {}
4245
_GEMINI_HASH_MAP: dict[str, str] = {}
4346
_OPENCODE_PROJECT_INDEX: dict[str, list[str]] = {}
@@ -129,6 +132,7 @@ def discover_projects() -> list[dict]:
129132
projects.extend(_discover_gemini_projects())
130133
projects.extend(_discover_opencode_projects())
131134
projects.extend(_discover_openclaw_projects())
135+
projects.extend(_discover_custom_projects())
132136
return sorted(projects, key=lambda p: (p["display_name"], p["source"]))
133137

134138

@@ -246,13 +250,99 @@ def _discover_openclaw_projects() -> list[dict]:
246250
return projects
247251

248252

253+
def _discover_custom_projects() -> list[dict]:
    """Discover user-provided projects stored under ``CUSTOM_DIR``.

    Every immediate subdirectory of ``CUSTOM_DIR`` that holds at least one
    non-blank line across its ``*.jsonl`` files is reported as one project.
    Lines are only counted here, not parsed or validated, so
    ``session_count`` is an upper bound on what parsing will later accept.

    Returns:
        A list of project dicts with the keys ``dir_name``, ``display_name``,
        ``session_count``, ``total_size_bytes`` and ``source``. The list is
        empty when ``CUSTOM_DIR`` is missing or is not a directory.
    """
    # is_dir() rather than exists(): a stray regular file at this path would
    # otherwise make iterdir() raise NotADirectoryError.
    if not CUSTOM_DIR.is_dir():
        return []

    projects = []
    for project_dir in sorted(CUSTOM_DIR.iterdir()):
        if not project_dir.is_dir():
            continue
        jsonl_files = list(project_dir.glob("*.jsonl"))
        if not jsonl_files:
            continue
        session_count = 0
        total_size = 0
        for jsonl_file in jsonl_files:
            try:
                total_size += jsonl_file.stat().st_size
                # Decode leniently: this pass only counts non-blank lines, so
                # a malformed byte must not abort discovery. The context
                # manager closes the handle deterministically.
                with jsonl_file.open(encoding="utf-8-sig", errors="replace") as handle:
                    session_count += sum(1 for line in handle if line.strip())
            except OSError:
                # Best effort: an unreadable file (permissions, broken
                # symlink) is skipped instead of failing discovery.
                continue
        if session_count == 0:
            continue
        projects.append(
            {
                "dir_name": project_dir.name,
                "display_name": f"custom:{project_dir.name}",
                "session_count": session_count,
                "total_size_bytes": total_size,
                "source": CUSTOM_SOURCE,
            }
        )
    return projects
284+
285+
286+
def _parse_custom_sessions(
    project_dir_name: str,
    anonymizer: Anonymizer,
) -> list[dict]:
    """Load and validate user-provided sessions for one custom project.

    Each non-blank line of every ``*.jsonl`` file in
    ``CUSTOM_DIR / project_dir_name`` must be a JSON object containing
    ``session_id``, ``model`` and ``messages``, where ``messages`` is a list
    of JSON objects. Lines that fail any check are logged and skipped.

    Accepted sessions get their ``project`` and ``source`` fields overwritten.
    Every message whose ``content`` is a string is passed through
    ``redact_text`` and then ``anonymizer.text``. Non-string ``content`` is
    left untouched by this function.

    Args:
        project_dir_name: Name of the project directory under ``CUSTOM_DIR``.
        anonymizer: Object whose ``text`` method is applied to the redacted
            message content.

    Returns:
        The accepted session dicts, in sorted-file then line order. Empty
        when the project directory does not exist.
    """
    project_path = CUSTOM_DIR / project_dir_name
    if not project_path.exists():
        return []

    required_fields = {"session_id", "model", "messages"}
    sessions = []
    for jsonl_file in sorted(project_path.glob("*.jsonl")):
        try:
            # utf-8-sig accepts files with or without a BOM. The context
            # manager guarantees the handle is closed even on error.
            with jsonl_file.open(encoding="utf-8-sig") as handle:
                for line_num, line in enumerate(handle, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        session = json.loads(line)
                    except json.JSONDecodeError:
                        logger.warning(
                            "custom:%s: %s line %d: invalid JSON, skipping",
                            project_dir_name, jsonl_file.name, line_num,
                        )
                        continue
                    if not isinstance(session, dict):
                        logger.warning(
                            "custom:%s: %s line %d: not a JSON object, skipping",
                            project_dir_name, jsonl_file.name, line_num,
                        )
                        continue
                    missing = required_fields - session.keys()
                    if missing:
                        logger.warning(
                            "custom:%s: %s line %d: missing required fields %s, skipping",
                            project_dir_name, jsonl_file.name, line_num, sorted(missing),
                        )
                        continue
                    messages = session["messages"]
                    # Reject malformed message containers up front. Otherwise
                    # a non-list value or a bare-string item would either
                    # crash the loop below or slip through unredacted.
                    if not isinstance(messages, list) or not all(
                        isinstance(msg, dict) for msg in messages
                    ):
                        logger.warning(
                            "custom:%s: %s line %d: 'messages' must be a list of objects, skipping",
                            project_dir_name, jsonl_file.name, line_num,
                        )
                        continue
                    session["project"] = f"custom:{project_dir_name}"
                    session["source"] = CUSTOM_SOURCE
                    # Redact secrets first, then anonymize what remains.
                    for msg in messages:
                        content = msg.get("content")
                        if isinstance(content, str):
                            redacted, _ = redact_text(content)
                            msg["content"] = anonymizer.text(redacted)
                    sessions.append(session)
        except (OSError, UnicodeDecodeError):
            # UnicodeDecodeError is a ValueError, not an OSError, so it must
            # be caught explicitly. Sessions already accepted from this file
            # before the failure are kept.
            logger.warning("custom:%s: failed to read %s", project_dir_name, jsonl_file.name)
    return sessions
334+
335+
249336
def parse_project_sessions(
250337
project_dir_name: str,
251338
anonymizer: Anonymizer,
252339
include_thinking: bool = True,
253340
source: str = CLAUDE_SOURCE,
254341
) -> list[dict]:
255342
"""Parse all sessions for a project into structured dicts."""
343+
if source == CUSTOM_SOURCE:
344+
return _parse_custom_sessions(project_dir_name, anonymizer)
345+
256346
if source == OPENCLAW_SOURCE:
257347
index = _get_openclaw_project_index()
258348
session_files = index.get(project_dir_name, [])

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "dataclaw"
7-
version = "0.3.0"
7+
version = "0.3.1"
88
description = "Export your coding agent conversations to Hugging Face as structured data"
99
requires-python = ">=3.10"
1010
license = "MIT"

tests/test_cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,7 +400,7 @@ def test_no_projects(self, monkeypatch, capsys):
400400
monkeypatch.setattr("dataclaw.cli.discover_projects", lambda: [])
401401
list_projects()
402402
captured = capsys.readouterr()
403-
assert "No Claude Code, Codex, Gemini CLI, OpenCode, or OpenClaw sessions" in captured.out
403+
assert "No Claude Code, Codex, Gemini CLI, OpenCode, OpenClaw, or Custom sessions" in captured.out
404404

405405
def test_source_filter_codex(self, monkeypatch, capsys):
406406
monkeypatch.setattr(
@@ -626,7 +626,7 @@ def test_export_requires_explicit_source_selection(self, monkeypatch, capsys):
626626
assert payload["error"] == "Source scope is not confirmed yet."
627627
assert payload["blocked_on_step"] == "Step 2/6"
628628
assert len(payload["process_steps"]) == 6
629-
assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "gemini", "openclaw", "opencode"]
629+
assert payload["allowed_sources"] == ["all", "both", "claude", "codex", "custom", "gemini", "openclaw", "opencode"]
630630
assert payload["next_command"] == "dataclaw config --source all"
631631

632632
def test_configure_next_steps_require_full_folder_presentation(self):

tests/test_parser.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,7 @@ def _disable_codex(self, tmp_path, monkeypatch):
437437
monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {})
438438
monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents")
439439
monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {})
440+
monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom")
440441

441442
def _write_opencode_db(self, db_path):
442443
conn = sqlite3.connect(db_path)
@@ -987,6 +988,7 @@ def _disable_codex(self, tmp_path, monkeypatch):
987988
monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {})
988989
monkeypatch.setattr("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents")
989990
monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {})
991+
monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom")
990992

991993
def test_discover_includes_subagent_sessions(self, tmp_path, monkeypatch, mock_anonymizer):
992994
self._disable_codex(tmp_path, monkeypatch)
@@ -1585,6 +1587,7 @@ def _disable_others(self, tmp_path, monkeypatch):
15851587
monkeypatch.setattr("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db")
15861588
monkeypatch.setattr("dataclaw.parser._OPENCODE_PROJECT_INDEX", {})
15871589
monkeypatch.setattr("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {})
1590+
monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "no-custom")
15881591

15891592
def test_discover_openclaw_projects(self, tmp_path, monkeypatch, mock_anonymizer):
15901593
self._disable_others(tmp_path, monkeypatch)
@@ -1657,3 +1660,117 @@ def test_multiple_agents_same_cwd(self, tmp_path, monkeypatch, mock_anonymizer):
16571660
projects = discover_projects()
16581661
assert len(projects) == 1
16591662
assert projects[0]["session_count"] == 2
1663+
1664+
1665+
class TestDiscoverCustomProjects:
    """Discovery and parsing of user-supplied JSONL for the custom source."""

    def _disable_others(self, tmp_path, monkeypatch):
        """Point every non-custom source at a missing path or an empty index."""
        overrides = [
            ("dataclaw.parser.PROJECTS_DIR", tmp_path / "no-claude"),
            ("dataclaw.parser.CODEX_SESSIONS_DIR", tmp_path / "no-codex-sessions"),
            ("dataclaw.parser.CODEX_ARCHIVED_DIR", tmp_path / "no-codex-archived"),
            ("dataclaw.parser._CODEX_PROJECT_INDEX", {}),
            ("dataclaw.parser.GEMINI_DIR", tmp_path / "no-gemini"),
            ("dataclaw.parser.OPENCODE_DB_PATH", tmp_path / "no-opencode.db"),
            ("dataclaw.parser._OPENCODE_PROJECT_INDEX", {}),
            ("dataclaw.parser.OPENCLAW_AGENTS_DIR", tmp_path / "no-openclaw-agents"),
            ("dataclaw.parser._OPENCLAW_PROJECT_INDEX", {}),
        ]
        for target, replacement in overrides:
            monkeypatch.setattr(target, replacement)

    def _make_valid_session(self, session_id="s1", model="gpt-4", content="hello"):
        """Return one JSONL line holding a session with all required fields."""
        user_turn = {"role": "user", "content": content}
        assistant_turn = {"role": "assistant", "content": "hi there"}
        stats = {
            "user_messages": 1,
            "assistant_messages": 1,
            "tool_uses": 0,
            "input_tokens": 10,
            "output_tokens": 5,
        }
        payload = {
            "session_id": session_id,
            "model": model,
            "messages": [user_turn, assistant_turn],
            "stats": stats,
        }
        return json.dumps(payload)

    def _install_project(self, tmp_path, monkeypatch, project_name, files):
        """Create ``custom/<project_name>`` with *files* and patch CUSTOM_DIR."""
        root = tmp_path / "custom"
        project_path = root / project_name
        project_path.mkdir(parents=True)
        for file_name, text in files.items():
            (project_path / file_name).write_text(text)
        monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", root)

    def test_discover_custom_projects(self, tmp_path, monkeypatch, mock_anonymizer):
        """A project with two JSONL lines is discovered with two sessions."""
        self._disable_others(tmp_path, monkeypatch)
        body = self._make_valid_session("s1") + "\n" + self._make_valid_session("s2") + "\n"
        self._install_project(tmp_path, monkeypatch, "my-project", {"sessions.jsonl": body})

        found = discover_projects()

        assert len(found) == 1
        only = found[0]
        assert only["display_name"] == "custom:my-project"
        assert only["session_count"] == 2
        assert only["source"] == "custom"

    def test_discover_skips_empty_dir(self, tmp_path, monkeypatch):
        """A project directory without JSONL files is not reported."""
        self._disable_others(tmp_path, monkeypatch)
        self._install_project(tmp_path, monkeypatch, "empty-project", {})

        found = discover_projects()

        assert len(found) == 0

    def test_discover_missing_dir(self, tmp_path, monkeypatch):
        """A missing custom root yields no projects."""
        self._disable_others(tmp_path, monkeypatch)
        monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", tmp_path / "nonexistent")

        found = discover_projects()

        assert len(found) == 0

    def test_parse_valid_sessions(self, tmp_path, monkeypatch, mock_anonymizer):
        """Valid lines are parsed and tagged with project and source."""
        body = (
            self._make_valid_session("s1") + "\n" + self._make_valid_session("s2", model="o1") + "\n"
        )
        self._install_project(tmp_path, monkeypatch, "test-proj", {"data.jsonl": body})

        parsed = parse_project_sessions("test-proj", mock_anonymizer, source="custom")

        assert len(parsed) == 2
        first, second = parsed
        assert first["session_id"] == "s1"
        assert second["model"] == "o1"
        assert first["project"] == "custom:test-proj"
        assert first["source"] == "custom"

    def test_parse_skips_missing_fields(self, tmp_path, monkeypatch, mock_anonymizer):
        """Lines lacking any required field are dropped."""
        rows = [
            self._make_valid_session("s1"),
            json.dumps({"session_id": "s2", "messages": []}),
            json.dumps({"session_id": "s3", "model": "m"}),
            json.dumps({"model": "m", "messages": []}),
        ]
        self._install_project(
            tmp_path, monkeypatch, "test-proj", {"data.jsonl": "\n".join(rows) + "\n"}
        )

        parsed = parse_project_sessions("test-proj", mock_anonymizer, source="custom")

        assert len(parsed) == 1
        assert parsed[0]["session_id"] == "s1"

    def test_parse_skips_invalid_json(self, tmp_path, monkeypatch, mock_anonymizer):
        """A line that is not JSON is skipped; valid lines still parse."""
        body = self._make_valid_session("s1") + "\n" + "not-json\n"
        self._install_project(tmp_path, monkeypatch, "test-proj", {"data.jsonl": body})

        parsed = parse_project_sessions("test-proj", mock_anonymizer, source="custom")

        assert len(parsed) == 1

    def test_parse_multiple_files(self, tmp_path, monkeypatch, mock_anonymizer):
        """Sessions from every JSONL file in the project are collected."""
        files = {
            "a.jsonl": self._make_valid_session("s1") + "\n",
            "b.jsonl": self._make_valid_session("s2") + "\n",
        }
        self._install_project(tmp_path, monkeypatch, "test-proj", files)

        parsed = parse_project_sessions("test-proj", mock_anonymizer, source="custom")

        assert len(parsed) == 2
        assert {entry["session_id"] for entry in parsed} == {"s1", "s2"}

    def test_parse_nonexistent_project(self, tmp_path, monkeypatch, mock_anonymizer):
        """Parsing an unknown project name returns an empty list."""
        root = tmp_path / "custom"
        root.mkdir(parents=True)
        monkeypatch.setattr("dataclaw.parser.CUSTOM_DIR", root)

        parsed = parse_project_sessions("nope", mock_anonymizer, source="custom")

        assert parsed == []

0 commit comments

Comments
 (0)