Skip to content

Commit 7254213

Browse files
author
Crovia Autopublish
committed
refactor(hubble): separate core generators, open verifiers, and tooling
1 parent c5d5bfb commit 7254213

9 files changed

Lines changed: 541 additions & 58 deletions

hubble/bin/README.md

Lines changed: 11 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,15 @@
1-
# Crovia — Global AI Training Omissions (Hubble)
1+
# Crovia Hubble · Open Engine Layout
22

3-
Public, verifiable observation of **missing training evidence**
4-
across widely used AI training datasets.
3+
This directory is intentionally structured.
54

6-
Crovia publishes **neutral evidence signals** derived from
7-
publicly observable artifacts. No claims, no accusations.
5+
## core/
6+
Deterministic generators of public evidence.
7+
These scripts produce the artifacts published in the Open Plane.
88

9-
---
9+
## verify/
10+
Open verification & explanation tools.
11+
These scripts allow third parties to understand and reproduce results.
1012

11-
🧾 **Hubble Ledger Status (public)**
12-
13-
_Auto-generated from public artifacts on each run._
14-
15-
Last evidence update: `2025-12-18T20:37:53Z`
16-
Absence receipts (7d): `18`
17-
Presence signals: `19`
18-
19-
**Top omissions (from ranking):**
20-
- `NEC#2` · avg_persistence_days=`1.0` · score=`76.9393`
21-
- `NEC#13` · avg_persistence_days=`1.0` · score=`74.8599`
22-
- `NEC#10` · avg_persistence_days=`1.0` · score=`31.8848`
23-
24-
Source of truth: `EVIDENCE.json` · `open/forensic/*`
25-
Engine (open-grade): `open/forensic/hubble_continuum.py`
26-
27-
---
28-
29-
🧠 **PRO shadow signals**
30-
31-
Some pressure indicators shown in this dataset are computed using
32-
internal heuristics **not included in the open engine**.
33-
34-
They are exposed only as:
35-
- comparative signals
36-
- consistency checks
37-
- early-warning pressure metrics
38-
39-
_No proprietary logic or methods are disclosed._
40-
41-
---
42-
43-
## What Crovia does
44-
45-
- Observes **absence of declared training evidence**
46-
- Tracks persistence over time
47-
- Publishes verifiable, neutral signals
48-
49-
## What Crovia does NOT do
50-
51-
- Does not audit models
52-
- Does not infer intent
53-
- Does not make legal or compliance claims
54-
- Does not accuse any organization
55-
56-
## Start here
57-
58-
- 📊 Rankings & signals → `open/`
59-
- 🧪 Evidence & forensic trails → `open/forensic/`
60-
- 🕒 Temporal pressure metrics → `open/temporal/`
61-
62-
This dataset is an **observation plane**, not a verdict engine.
13+
## tools/
14+
Operational utilities (publishing, badges, helpers).
15+
They do not define evidence or truth.
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python3
2+
import json
3+
from pathlib import Path
4+
from datetime import datetime
5+
6+
# Repository root: two directory levels above this script.
BASE = Path(__file__).resolve().parent.parent
# Input: append-only NDJSON ledger of necessity-state records.
LEDGER = BASE / "data" / "ledger" / "ledger.ndjson"
# Output: per-record persistence metrics (NDJSON, one object per line).
OUT = BASE / "data" / "metrics" / "persistence.ndjson"
9+
10+
def parse_day(s: str) -> datetime:
    """Parse the leading YYYY-MM-DD portion of *s* into a datetime.

    Accepts a bare date as well as strings with trailing content
    (e.g. "YYYY-MM-DDTHH:MM" or "YYYY-MM-DD-anything"); only the
    first ten characters are considered.
    """
    day_part = s[:10]
    return datetime.strptime(day_part, "%Y-%m-%d")
14+
15+
def main():
    """Build per-record persistence metrics from the ledger.

    Folds state and state-update records (keyed by record_id) into a
    single entry per record, then writes one NDJSON line per record
    with the inclusive day span between first_seen and last_seen.
    """
    states = {}

    with open(LEDGER, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the ledger
            obj = json.loads(line)
            rid = obj.get("record_id")
            if not rid:
                continue

            schema = obj.get("schema")

            if schema == "crovia_necessity_state.v1":
                states[rid] = {
                    "record_id": rid,
                    "source_id": obj.get("source_id"),
                    "necessity_id": obj.get("necessity_id"),
                    "first_seen": obj.get("first_seen"),
                    "last_seen": obj.get("last_seen")
                }

            elif schema == "crovia_necessity_state_update.v1":
                # Updates only refresh last_seen; ignore updates for
                # records whose initial state was never observed.
                if rid in states:
                    new_last = obj.get("last_seen")
                    # BUG FIX: never clobber an existing date with None
                    # from a malformed update record.
                    if new_last:
                        states[rid]["last_seen"] = new_last

    OUT.parent.mkdir(parents=True, exist_ok=True)

    written = 0
    with open(OUT, "w", encoding="utf-8") as out:
        for s in states.values():
            # BUG FIX: skip states with missing dates — previously
            # parse_day(None) raised TypeError and aborted the run.
            if not s["first_seen"] or not s["last_seen"]:
                continue
            fs = parse_day(s["first_seen"])
            ls = parse_day(s["last_seen"])
            days = (ls - fs).days + 1  # inclusive span

            out.write(json.dumps({
                "record_id": s["record_id"],
                "source_id": s["source_id"],
                "necessity_id": s["necessity_id"],
                "days_persistent": days,
                "from": s["first_seen"],
                "to": s["last_seen"]
            }) + "\n")
            written += 1

    # Report the number actually written, not merely accumulated.
    print(f"Wrote persistence for {written} records → {OUT}")

if __name__ == "__main__":
    main()

hubble/bin/core/make_snapshot.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/usr/bin/env python3
2+
import json, hashlib
3+
from pathlib import Path
4+
from datetime import datetime, timezone
5+
6+
# Repository root, assumed two levels above this script.
# NOTE(review): per the commit layout this file lives under bin/core/,
# so parent.parent resolves to .../bin — confirm the data/ tree really
# sits there (rank_omissions.py instead hard-codes /opt/crovia/hubble).
BASE = Path(__file__).resolve().parent.parent
# Inputs: append-only ledger and the derived persistence metrics.
LEDGER = BASE / "data" / "ledger" / "ledger.ndjson"
PERSIST = BASE / "data" / "metrics" / "persistence.ndjson"
# Output directory for daily snapshot manifests.
SNAPDIR = BASE / "data" / "snapshots"

# Import-time side effect: ensure the snapshot directory exists.
SNAPDIR.mkdir(parents=True, exist_ok=True)
12+
13+
def sha256(path):
    """Return the hex SHA-256 digest of the file at *path*.

    Streams the file in 8 KiB chunks so arbitrarily large inputs are
    hashed without being loaded fully into memory.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            chunk = fh.read(8192)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
19+
20+
def main():
    """Write today's snapshot manifest.

    The snapshot pins the SHA-256 digests of the ledger and the
    persistence file plus their line counts, giving a verifiable
    daily fingerprint of the published evidence.
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    snap = SNAPDIR / f"snapshot_{ts}.json"

    def count_lines(path):
        # BUG FIX: count via a context manager — the previous
        # `sum(1 for _ in open(path))` leaked file handles and
        # relied on the locale-default encoding.
        with open(path, "r", encoding="utf-8") as f:
            return sum(1 for _ in f)

    data = {
        "date": ts,
        "ledger_sha256": sha256(LEDGER),
        "persistence_sha256": sha256(PERSIST),
        "records": count_lines(LEDGER),
        "persistence_records": count_lines(PERSIST)
    }

    # Explicit UTF-8 so the manifest is byte-stable across platforms.
    snap.write_text(json.dumps(data, indent=2), encoding="utf-8")
    print(f"Snapshot written → {snap}")

if __name__ == "__main__":
    main()

hubble/bin/core/rank_omissions.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env python3
2+
import json
3+
from pathlib import Path
4+
from collections import defaultdict
5+
from math import log
6+
7+
# NOTE(review): hard-coded absolute root, unlike the sibling generators
# which derive BASE from __file__ — confirm this divergence is intended.
BASE = Path("/opt/crovia/hubble")
# Input: per-record persistence metrics produced by the upstream step.
PERSIST = BASE / "data" / "metrics" / "persistence.ndjson"
# Output: aggregated global ranking of omissions.
OUT = BASE / "data" / "metrics" / "global_ranking.json"
10+
11+
def main():
    """Aggregate persistence metrics into a global omissions ranking.

    Groups persistence records by necessity_id, scores each group as
    records * log(1 + avg_persistence_days), and writes the ranking
    (highest score first) to OUT.
    """
    # Local import: timestamping is only needed here.
    from datetime import datetime, timezone

    stats = defaultdict(lambda: {"records": 0, "sum_days": 0})

    for line in PERSIST.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue  # tolerate blank lines in the NDJSON input
        r = json.loads(line)
        nid = r["necessity_id"]
        stats[nid]["records"] += 1
        stats[nid]["sum_days"] += r["days_persistent"]

    ranking = []
    for nid, s in stats.items():
        avg = s["sum_days"] / s["records"]
        # Volume-weighted score: rewards both breadth (record count)
        # and depth (average persistence), with logarithmic damping.
        score = s["records"] * log(1 + avg)
        ranking.append({
            "necessity_id": nid,
            "records": s["records"],
            "avg_persistence_days": round(avg, 2),
            "score": round(score, 4)
        })

    ranking.sort(key=lambda x: x["score"], reverse=True)

    OUT.write_text(json.dumps({
        # BUG FIX: previously this recorded the *old* output file's
        # mtime (or None on first run), never the actual generation
        # time. Stamp the current UTC time instead.
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "ranking": ranking
    }, indent=2), encoding="utf-8")

    print(f"Wrote ranking → {OUT}")

if __name__ == "__main__":
    main()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python3
2+
import requests, time, yaml
3+
4+
# Output path for the auto-generated source list (relative to CWD).
OUT = "canon/sources.auto.yaml"
# Maximum number of Hugging Face datasets to include.
LIMIT = 120
# Seconds to pause between README probes (politeness / rate limiting).
SLEEP = 0.4
7+
8+
def has_readme(dataset_id):
    """Return True if the dataset publishes a non-trivial README.

    Probes the raw README.md on the main branch; responses shorter
    than ~50 characters are treated as effectively empty.
    """
    readme_url = (
        "https://huggingface.co/datasets/"
        f"{dataset_id}/raw/main/README.md"
    )
    resp = requests.get(readme_url, timeout=15)
    if resp.status_code != 200:
        return False
    return len(resp.text) > 50
12+
13+
def main():
    """Generate canon/sources.auto.yaml from the Hugging Face catalog.

    Fetches the public dataset index, prefers the most-downloaded
    entries, keeps only datasets with a non-trivial README, and
    writes up to LIMIT of them as Crovia source records.
    """
    api = "https://huggingface.co/api/datasets"
    r = requests.get(api, timeout=30)
    r.raise_for_status()
    data = r.json()

    # Sort by downloads when the API provides the field; entries
    # without it sort as 0 and keep their relative API order.
    data.sort(key=lambda d: d.get("downloads", 0), reverse=True)

    sources = []
    for d in data:
        if len(sources) >= LIMIT:
            break
        did = d.get("id")
        if not did:
            continue
        try:
            if not has_readme(did):
                continue
        except Exception:
            # Best-effort probe: a network hiccup skips this dataset
            # rather than aborting the whole run.
            continue

        sources.append({
            "id": "hf_" + did.replace("/", "_"),
            "url": "https://huggingface.co/datasets/" + did
        })
        time.sleep(SLEEP)  # politeness delay between accepted probes

    # BUG FIX: explicit UTF-8 so the output is byte-stable across
    # platforms (previously used the locale-default encoding).
    with open(OUT, "w", encoding="utf-8") as f:
        yaml.dump({"sources": sources}, f, sort_keys=False)

    print(f"Generated {len(sources)} HF sources → {OUT}")

if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)