Skip to content

Commit 7254213

Browse files
author
Crovia Autopublish
committed
refactor(hubble): separate core generators, open verifiers, and tooling
1 parent c5d5bfb commit 7254213

9 files changed

Lines changed: 541 additions & 58 deletions

hubble/bin/README.md

Lines changed: 11 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1,62 +1,15 @@
1-
# Crovia — Global AI Training Omissions (Hubble)
1+
# Crovia Hubble · Open Engine Layout
22

3-
Public, verifiable observation of **missing training evidence**
4-
across widely used AI training datasets.
3+
This directory is intentionally structured.
54

6-
Crovia publishes **neutral evidence signals** derived from
7-
publicly observable artifacts. No claims, no accusations.
5+
## core/
6+
Deterministic generators of public evidence.
7+
These scripts produce the artifacts published in the Open Plane.
88

9-
---
9+
## verify/
10+
Open verification & explanation tools.
11+
These scripts allow third parties to understand and reproduce results.
1012

11-
🧾 **Hubble Ledger Status (public)**
12-
13-
_Auto-generated from public artifacts on each run._
14-
15-
Last evidence update: `2025-12-18T20:37:53Z`
16-
Absence receipts (7d): `18`
17-
Presence signals: `19`
18-
19-
**Top omissions (from ranking):**
20-
- `NEC#2` · avg_persistence_days=`1.0` · score=`76.9393`
21-
- `NEC#13` · avg_persistence_days=`1.0` · score=`74.8599`
22-
- `NEC#10` · avg_persistence_days=`1.0` · score=`31.8848`
23-
24-
Source of truth: `EVIDENCE.json` · `open/forensic/*`
25-
Engine (open-grade): `open/forensic/hubble_continuum.py`
26-
27-
---
28-
29-
🧠 **PRO shadow signals**
30-
31-
Some pressure indicators shown in this dataset are computed using
32-
internal heuristics **not included in the open engine**.
33-
34-
They are exposed only as:
35-
- comparative signals
36-
- consistency checks
37-
- early-warning pressure metrics
38-
39-
_No proprietary logic or methods are disclosed._
40-
41-
---
42-
43-
## What Crovia does
44-
45-
- Observes **absence of declared training evidence**
46-
- Tracks persistence over time
47-
- Publishes verifiable, neutral signals
48-
49-
## What Crovia does NOT do
50-
51-
- Does not audit models
52-
- Does not infer intent
53-
- Does not make legal or compliance claims
54-
- Does not accuse any organization
55-
56-
## Start here
57-
58-
- 📊 Rankings & signals → `open/`
59-
- 🧪 Evidence & forensic trails → `open/forensic/`
60-
- 🕒 Temporal pressure metrics → `open/temporal/`
61-
62-
This dataset is an **observation plane**, not a verdict engine.
13+
## tools/
14+
Operational utilities (publishing, badges, helpers).
15+
They do not define evidence or truth.
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#!/usr/bin/env python3
2+
import json
3+
from pathlib import Path
4+
from datetime import datetime
5+
6+
# Repository root: two directory levels above this script.
BASE = Path(__file__).resolve().parent.parent
# Input: append-only NDJSON ledger of necessity-state records.
LEDGER = BASE / "data" / "ledger" / "ledger.ndjson"
# Output: per-record persistence metrics (NDJSON, one object per line).
OUT = BASE / "data" / "metrics" / "persistence.ndjson"
9+
10+
def parse_day(s: str) -> datetime:
    """Parse the leading YYYY-MM-DD portion of *s* into a datetime.

    Accepts a bare date as well as strings with trailing content
    (e.g. "YYYY-MM-DDTHH:MM" or "YYYY-MM-DD-anything"); only the
    first ten characters are considered.
    """
    day_part = s[:10]
    return datetime.strptime(day_part, "%Y-%m-%d")
14+
15+
def main():
    """Build per-record persistence metrics from the ledger.

    Folds state and state-update records (keyed by record_id) into a
    single entry per record, then writes one NDJSON line per record
    with the inclusive day span between first_seen and last_seen.
    """
    states = {}

    with open(LEDGER, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines in the ledger
            obj = json.loads(line)
            rid = obj.get("record_id")
            if not rid:
                continue

            schema = obj.get("schema")

            if schema == "crovia_necessity_state.v1":
                states[rid] = {
                    "record_id": rid,
                    "source_id": obj.get("source_id"),
                    "necessity_id": obj.get("necessity_id"),
                    "first_seen": obj.get("first_seen"),
                    "last_seen": obj.get("last_seen")
                }

            elif schema == "crovia_necessity_state_update.v1":
                # Updates only refresh last_seen; ignore updates for
                # records whose initial state was never observed.
                if rid in states:
                    new_last = obj.get("last_seen")
                    # BUG FIX: never clobber an existing date with None
                    # from a malformed update record.
                    if new_last:
                        states[rid]["last_seen"] = new_last

    OUT.parent.mkdir(parents=True, exist_ok=True)

    written = 0
    with open(OUT, "w", encoding="utf-8") as out:
        for s in states.values():
            # BUG FIX: skip states with missing dates — previously
            # parse_day(None) raised TypeError and aborted the run.
            if not s["first_seen"] or not s["last_seen"]:
                continue
            fs = parse_day(s["first_seen"])
            ls = parse_day(s["last_seen"])
            days = (ls - fs).days + 1  # inclusive span

            out.write(json.dumps({
                "record_id": s["record_id"],
                "source_id": s["source_id"],
                "necessity_id": s["necessity_id"],
                "days_persistent": days,
                "from": s["first_seen"],
                "to": s["last_seen"]
            }) + "\n")
            written += 1

    # Report the number actually written, not merely accumulated.
    print(f"Wrote persistence for {written} records → {OUT}")

if __name__ == "__main__":
    main()

hubble/bin/core/make_snapshot.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/usr/bin/env python3
2+
import json, hashlib
3+
from pathlib import Path
4+
from datetime import datetime, timezone
5+
6+
# Repository root, assumed two levels above this script.
# NOTE(review): per the commit layout this file lives under bin/core/,
# so parent.parent resolves to .../bin — confirm the data/ tree really
# sits there (rank_omissions.py instead hard-codes /opt/crovia/hubble).
BASE = Path(__file__).resolve().parent.parent
# Inputs: append-only ledger and the derived persistence metrics.
LEDGER = BASE / "data" / "ledger" / "ledger.ndjson"
PERSIST = BASE / "data" / "metrics" / "persistence.ndjson"
# Output directory for daily snapshot manifests.
SNAPDIR = BASE / "data" / "snapshots"

# Import-time side effect: ensure the snapshot directory exists.
SNAPDIR.mkdir(parents=True, exist_ok=True)
12+
13+
def sha256(path):
    """Return the hex SHA-256 digest of the file at *path*.

    Streams the file in 8 KiB chunks so arbitrarily large inputs are
    hashed without being loaded fully into memory.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        while True:
            chunk = fh.read(8192)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
19+
20+
def main():
    """Write today's snapshot manifest.

    The snapshot pins the SHA-256 digests of the ledger and the
    persistence file plus their line counts, giving a verifiable
    daily fingerprint of the published evidence.
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    snap = SNAPDIR / f"snapshot_{ts}.json"

    def count_lines(path):
        # BUG FIX: count via a context manager — the previous
        # `sum(1 for _ in open(path))` leaked file handles and
        # relied on the locale-default encoding.
        with open(path, "r", encoding="utf-8") as f:
            return sum(1 for _ in f)

    data = {
        "date": ts,
        "ledger_sha256": sha256(LEDGER),
        "persistence_sha256": sha256(PERSIST),
        "records": count_lines(LEDGER),
        "persistence_records": count_lines(PERSIST)
    }

    # Explicit UTF-8 so the manifest is byte-stable across platforms.
    snap.write_text(json.dumps(data, indent=2), encoding="utf-8")
    print(f"Snapshot written → {snap}")

if __name__ == "__main__":
    main()

hubble/bin/core/rank_omissions.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env python3
2+
import json
3+
from pathlib import Path
4+
from collections import defaultdict
5+
from math import log
6+
7+
# NOTE(review): hard-coded absolute root, unlike the sibling generators
# which derive BASE from __file__ — confirm this divergence is intended.
BASE = Path("/opt/crovia/hubble")
# Input: per-record persistence metrics produced by the upstream step.
PERSIST = BASE / "data" / "metrics" / "persistence.ndjson"
# Output: aggregated global ranking of omissions.
OUT = BASE / "data" / "metrics" / "global_ranking.json"
10+
11+
def main():
    """Aggregate persistence metrics into a global omissions ranking.

    Groups persistence records by necessity_id, scores each group as
    records * log(1 + avg_persistence_days), and writes the ranking
    (highest score first) to OUT.
    """
    # Local import: timestamping is only needed here.
    from datetime import datetime, timezone

    stats = defaultdict(lambda: {"records": 0, "sum_days": 0})

    for line in PERSIST.read_text(encoding="utf-8").splitlines():
        if not line.strip():
            continue  # tolerate blank lines in the NDJSON input
        r = json.loads(line)
        nid = r["necessity_id"]
        stats[nid]["records"] += 1
        stats[nid]["sum_days"] += r["days_persistent"]

    ranking = []
    for nid, s in stats.items():
        avg = s["sum_days"] / s["records"]
        # Volume-weighted score: rewards both breadth (record count)
        # and depth (average persistence), with logarithmic damping.
        score = s["records"] * log(1 + avg)
        ranking.append({
            "necessity_id": nid,
            "records": s["records"],
            "avg_persistence_days": round(avg, 2),
            "score": round(score, 4)
        })

    ranking.sort(key=lambda x: x["score"], reverse=True)

    OUT.write_text(json.dumps({
        # BUG FIX: previously this recorded the *old* output file's
        # mtime (or None on first run), never the actual generation
        # time. Stamp the current UTC time instead.
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "ranking": ranking
    }, indent=2), encoding="utf-8")

    print(f"Wrote ranking → {OUT}")

if __name__ == "__main__":
    main()
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python3
2+
import requests, time, yaml
3+
4+
# Output path for the auto-generated source list (relative to CWD).
OUT = "canon/sources.auto.yaml"
# Maximum number of Hugging Face datasets to include.
LIMIT = 120
# Seconds to pause between README probes (politeness / rate limiting).
SLEEP = 0.4
7+
8+
def has_readme(dataset_id):
    """Return True if the dataset publishes a non-trivial README.

    Probes the raw README.md on the main branch; responses shorter
    than ~50 characters are treated as effectively empty.
    """
    readme_url = (
        "https://huggingface.co/datasets/"
        f"{dataset_id}/raw/main/README.md"
    )
    resp = requests.get(readme_url, timeout=15)
    if resp.status_code != 200:
        return False
    return len(resp.text) > 50
12+
13+
def main():
    """Generate canon/sources.auto.yaml from the Hugging Face catalog.

    Fetches the public dataset index, prefers the most-downloaded
    entries, keeps only datasets with a non-trivial README, and
    writes up to LIMIT of them as Crovia source records.
    """
    api = "https://huggingface.co/api/datasets"
    r = requests.get(api, timeout=30)
    r.raise_for_status()
    data = r.json()

    # Sort by downloads when the API provides the field; entries
    # without it sort as 0 and keep their relative API order.
    data.sort(key=lambda d: d.get("downloads", 0), reverse=True)

    sources = []
    for d in data:
        if len(sources) >= LIMIT:
            break
        did = d.get("id")
        if not did:
            continue
        try:
            if not has_readme(did):
                continue
        except Exception:
            # Best-effort probe: a network hiccup skips this dataset
            # rather than aborting the whole run.
            continue

        sources.append({
            "id": "hf_" + did.replace("/", "_"),
            "url": "https://huggingface.co/datasets/" + did
        })
        time.sleep(SLEEP)  # politeness delay between accepted probes

    # BUG FIX: explicit UTF-8 so the output is byte-stable across
    # platforms (previously used the locale-default encoding).
    with open(OUT, "w", encoding="utf-8") as f:
        yaml.dump({"sources": sources}, f, sort_keys=False)

    print(f"Generated {len(sources)} HF sources → {OUT}")

if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)