From 59b10c43bad6c04e60b5791dcc6f3dc4eeebd42c Mon Sep 17 00:00:00 2001
From: eric-wang-1990 <e.wang@databricks.com>
Date: Thu, 4 Jun 2026 23:49:59 +0000
Subject: [PATCH] ci: classify Reyden nightly E2E failures by root cause

The raw pass-rate on the SEA/Reyden nightly is dominated by expected failures
(unsupported DDL/types, no Thrift endpoint, no CloudFetch), which hides the
genuine driver bugs. Layer a coarse root-cause category on top of the existing
per-failure signature so the dashboard separates the two.

parse-trx-to-json.py:
- Refine signature_for() with Reyden-specific buckets: Thrift-on-SEA endpoint
  (ENDPOINT_NOT_FOUND + "Thrift server error"), CloudFetch download failure,
  and unsupported DDL/statement/type. Reorder so PARSE_SYNTAX_ERROR is matched
  before the broad assertion bucket (whose "expected" token also matches
  "Expected identifier" syntax errors).
- Add category_for() with three buckets and emit per-failure "category" plus a
  "by_category" rollup. Classification follows the failing step encoded in the
  message: a rejected CREATE surfaces as an "Unsupported ..." gap, whereas a
  value/cast mismatch means setup succeeded and the round-trip returned wrong
  data -- a real driver bug.

update-e2e-dashboard.py: propagate by_category into the runs.json summary row.

index.html: render a "By root-cause category" rollup with a color-coded legend,
and group the expanded failure detail by category then signature. Degrades
gracefully for older runs that predate by_category.

Co-authored-by: Isaac
---
 .github/e2e-dashboard/index.html        | 37 ++++++++++++-
 .github/scripts/parse-trx-to-json.py    | 74 ++++++++++++++++++++++++-
 .github/scripts/update-e2e-dashboard.py |  2 +-
 3 files changed, 108 insertions(+), 5 deletions(-)
diff --git a/.github/e2e-dashboard/index.html b/.github/e2e-dashboard/index.html
index 14807b33..600b8de8 100644
--- a/.github/e2e-dashboard/index.html
+++ b/.github/e2e-dashboard/index.html
@@ -55,6 +55,12 @@
   .muted { color:var(--muted); } .small { font-size:12px; }
   .group-h { margin:12px 0 4px; font-size:13px; font-weight:700; }
   canvas { max-height:340px; }
+  .catbadge { display:inline-block; font-size:12px; font-weight:600; padding:2px 8px; border-radius:6px; }
+  .cat-gap  { background:#e8eefc; color:#2949b8; }
+  .cat-env  { background:#fdf3dc; color:#9a7400; }
+  .cat-real { background:#fbe7e7; color:#d83b3b; }
+  .legend { list-style:none; margin:10px 0 4px; padding:0; font-size:12px; color:var(--muted); }
+  .legend li { margin:5px 0; line-height:1.5; }
 </style>
 </head>
 <body>
@@ -98,6 +104,21 @@ <h2>Run history <span class="muted small">(click a row to expand failures)</span
 const fmtDate = ts => { if (!ts) return "—"; const d = new Date(ts); return isNaN(d) ? ts : d.toISOString().slice(0,16).replace("T"," "); };
 const esc = s => (s||"").replace(/[&<>"]/g, c => ({"&":"&amp;","<":"&lt;",">":"&gt;",'"':"&quot;"}[c]));
 
+// Root-cause categories emitted by parse-trx-to-json.py. A low raw pass-rate is
+// usually dominated by expected Reyden gaps; the "Real issue" bucket is the
+// actual backlog. A test with a CREATE step that Reyden can't run fails AT that
+// step (an "Unsupported …" gap); a value mismatch means setup succeeded and the
+// round-trip returned wrong data — a genuine bug.
+const CATEGORIES = {
+  "Reyden capability gap (expected)": { cls:"cat-gap",
+    blurb:"The backend doesn't support the operation or protocol — unsupported DDL/statement/type (e.g. CREATE TABLE/SCHEMA), no Thrift endpoint on a SEA/REST-only warehouse, or CloudFetch. Expected: gate/skip these; they shouldn't count against the driver." },
+  "Environment / infra": { cls:"cat-env",
+    blurb:"Missing/misconfigured warehouse, read-only rejection, auth, timeout, or transport error. Not a driver bug — fix the environment or retry." },
+  "Real issue / to investigate": { cls:"cat-real",
+    blurb:"A genuine driver bug. A value/cast mismatch on an INSERT→SELECT→DELETE round-trip means setup succeeded but the data came back wrong (e.g. a SEA-path serialization difference); also covers SQL/syntax errors and anything unclassified." },
+};
+const catBadge = name => `<span class="catbadge ${(CATEGORIES[name]||{}).cls||""}">${esc(name)}</span>`;
+
 function bar(p, f, s) {
   const t = p+f+s || 1;
   return `<div class="bar" title="${p} passed / ${f} failed / ${s} skipped">
@@ -142,6 +163,18 @@ <h2>Run history <span class="muted small">(click a row to expand failures)</span
 function renderAnalysis(latest) {
   const el = document.getElementById("analysis");
   if (!latest || !latest.failed) { el.innerHTML = '<p class="muted">No failures in the latest run. 🎉</p>'; return; }
+  const catEntries = Object.entries(latest.by_category||{});
+  const catRows = catEntries.map(([k,v]) =>
+    `<tr><td>${catBadge(k)}</td><td class="num">${v}</td></tr>`).join("");
+  const legend = catEntries.length
+    ? `<ul class="legend">` + catEntries.map(([k]) =>
+        `<li>${catBadge(k)} ${esc((CATEGORIES[k]||{}).blurb||"")}</li>`).join("") + `</ul>`
+    : "";
+  const catSection = catEntries.length ? `
+    <div class="group-h">By root-cause category</div>
+    <table><thead><tr><th>Category</th><th class="num">Count</th></tr></thead><tbody>${catRows}</tbody></table>
+    ${legend}` : "";
+
   const sigRows = Object.entries(latest.by_signature||{}).map(([k,v]) =>
     `<tr><td class="sig">${esc(k)}</td><td class="num">${v}</td></tr>`).join("");
   const clsRows = Object.entries(latest.by_class||{}).map(([k,v]) =>
@@ -149,6 +182,7 @@ <h2>Run history <span class="muted small">(click a row to expand failures)</span
   el.innerHTML = `
     <p class="muted small">Latest run <a href="${esc(latest.html_url)}">#${esc(latest.run_number||latest.run_id)}</a>
        (${esc(latest.protocol)}${latest.read_only?", read-only":""}) — ${latest.failed} failures grouped below.</p>
+    ${catSection}
     <div class="group-h">By failure signature (root cause)</div>
     <table><thead><tr><th>Signature</th><th class="num">Count</th></tr></thead><tbody>${sigRows}</tbody></table>
     <div class="group-h">By test class</div>
@@ -167,8 +201,9 @@ <h2>Run history <span class="muted small">(click a row to expand failures)</span
     const r = await fetch("data/" + run.detail);
     const full = await r.json();
     if (!full.failures || !full.failures.length) { det.firstChild.innerHTML = '<span class="muted">No failure detail recorded.</span>'; return; }
-    let html = "", curSig = null;
+    let html = "", curCat = null, curSig = null;
     for (const f of full.failures) {
+      if (f.category && f.category !== curCat) { curCat = f.category; curSig = null; html += `<div class="group-h">${catBadge(f.category)}</div>`; }
       if (f.signature !== curSig) { curSig = f.signature; html += `<div class="group-h sig">${esc(curSig)}</div>`; }
       html += `<ul class="fail-list"><li><div class="tname">${esc(f.name)}</div>` +
               (f.message ? `<div class="msg">${esc(f.message)}</div>` : "") + `</li></ul>`;
diff --git a/.github/scripts/parse-trx-to-json.py b/.github/scripts/parse-trx-to-json.py
index 4bbdcbb1..f33d46e1 100644
--- a/.github/scripts/parse-trx-to-json.py
+++ b/.github/scripts/parse-trx-to-json.py
@@ -48,16 +48,43 @@ def signature_for(message):
         return "Unknown / no message"
     m = message.replace("\r", " ").replace("\n", " ")
 
+    # Order matters: the most specific patterns win. In particular the
+    # Thrift-on-SEA and CloudFetch buckets must be matched before the generic
+    # "Couldn't connect / HttpRequestException" transport bucket, because their
+    # messages also contain that text.
     patterns = [
+        # --- Reyden capability gaps (expected; see _SIGNATURE_CATEGORY) -----
+        # A Thrift session against a SEA/REST-only warehouse (e.g. Reyden) has
+        # no Thrift endpoint, so the server returns ENDPOINT_NOT_FOUND. This is
+        # NOT a missing/misconfigured warehouse — the warehouse works over REST.
+        (r"Thrift server error.*ENDPOINT_NOT_FOUND|ENDPOINT_NOT_FOUND.*Thrift server error",
+         "Thrift endpoint unavailable on SEA/Reyden warehouse"),
+        (r"Error in download process|CloudFetch.*download",
+         "CloudFetch not supported on Reyden (download failed)"),
+        (r"PARSER_UNSUPPORTED_FEATURE|UNSUPPORTED_FEATURE|Unsupported statement|"
+         r"Unsupported CREATE type|Unsupported Delta table type|Unsupported .*type",
+         "Reyden unsupported feature (DDL / statement / type)"),
+        # --- Environment / infra --------------------------------------------
         (r"ENDPOINT_NOT_FOUND", "Warehouse not found (ENDPOINT_NOT_FOUND / HTTP 404)"),
         (r"read[- ]?only|READ_ONLY|cannot be modified|not.*allowed.*read", "Read-only warehouse rejected write/DDL"),
-        (r"INSERT|UPDATE|DELETE|MERGE|CREATE TABLE|DROP TABLE|ALTER TABLE", "DML/DDL rejected"),
         (r"PERMISSION_DENIED|not authorized|Forbidden|HTTP 403", "Permission denied (403)"),
         (r"timeout|timed out|TimeoutException", "Timeout"),
         (r"Couldn't connect|connection refused|HttpRequestException", "Connection / transport error"),
-        (r"TABLE_OR_VIEW_NOT_FOUND|cannot be found|does not exist", "Object not found"),
+        # --- Genuine SQL errors with specific codes ------------------------
+        # Checked before the broad assertion bucket below, whose "expected"
+        # token also appears in "Expected identifier ..." syntax-error text.
         (r"PARSE_SYNTAX_ERROR|SYNTAX_ERROR", "SQL syntax error"),
+        # --- Genuine driver bugs --------------------------------------------
+        # A value/cast mismatch means the test got PAST any setup (a rejected
+        # CREATE TABLE would have failed earlier with an "Unsupported …" message
+        # in the Reyden-gap bucket above). So a wrong value on an
+        # INSERT→SELECT→DELETE round-trip is a real driver bug, e.g. a SEA-path
+        # result-serialization difference — not expected Reyden behaviour.
+        (r"CAST_INVALID_INPUT", "Type cast mismatch on round-trip"),
+        (r"TABLE_OR_VIEW_NOT_FOUND|cannot be found|does not exist", "Object not found"),
         (r"Assert\.|Equal\(|Xunit|expected", "Assertion failed (value mismatch)"),
+        # --- Generic DML/DDL rejection (catch-all, lowest priority) --------
+        (r"INSERT|UPDATE|DELETE|MERGE|CREATE TABLE|DROP TABLE|ALTER TABLE", "DML/DDL rejected"),
     ]
     for pat, label in patterns:
         if re.search(pat, m, re.IGNORECASE):
@@ -69,6 +96,43 @@ def signature_for(message):
     return "Other"
 
 
+# Root-cause category layered on top of the fine-grained signature. The
+# dashboard rolls failures up to these three buckets so a low raw pass-rate
+# (dominated by expected Reyden gaps) doesn't mask the genuine driver bugs.
+#
+# Classification hinges on WHICH step failed, which the message already encodes:
+#   - A test with a CREATE TABLE/SCHEMA step that Reyden can't run fails AT that
+#     step with an "Unsupported …" message -> CAT_REYDEN_GAP (expected).
+#   - A value/cast mismatch means setup succeeded and the INSERT→SELECT→DELETE
+#     round-trip returned wrong data -> CAT_REAL (a genuine driver bug).
+CAT_REYDEN_GAP = "Reyden capability gap (expected)"
+CAT_ENVIRONMENT = "Environment / infra"
+CAT_REAL = "Real issue / to investigate"
+
+# Explicit signature -> category map. Any signature not listed here (including
+# the dynamic "<ExceptionType>" fallbacks and the value/cast/DML/syntax buckets)
+# is treated as CAT_REAL so genuine, unclassified failures surface rather than
+# hide.
+_SIGNATURE_CATEGORY = {
+    "Thrift endpoint unavailable on SEA/Reyden warehouse": CAT_REYDEN_GAP,
+    "CloudFetch not supported on Reyden (download failed)": CAT_REYDEN_GAP,
+    "Reyden unsupported feature (DDL / statement / type)": CAT_REYDEN_GAP,
+    "Warehouse not found (ENDPOINT_NOT_FOUND / HTTP 404)": CAT_ENVIRONMENT,
+    "Read-only warehouse rejected write/DDL": CAT_ENVIRONMENT,
+    "Permission denied (403)": CAT_ENVIRONMENT,
+    "Timeout": CAT_ENVIRONMENT,
+    "Connection / transport error": CAT_ENVIRONMENT,
+    # "Assertion failed (value mismatch)", "Type cast mismatch on round-trip",
+    # "Object not found", "DML/DDL rejected", "SQL syntax error", "Other",
+    # "Unknown / no message" and any "<ExceptionType>" fall through to CAT_REAL.
+}
+
+
+def category_for(signature):
+    """Map a fine-grained signature to one of the three root-cause buckets."""
+    return _SIGNATURE_CATEGORY.get(signature, CAT_REAL)
+
+
 def class_of(test_name):
     """Best-effort owning class: strip the parameter list and the method."""
     base = test_name.split("(", 1)[0]
@@ -150,6 +214,7 @@ def main():
     # Trim payload: keep full detail for failures only.
     for r in failed:
         r["signature"] = signature_for(r["message"])
+        r["category"] = category_for(r["signature"])
         if len(r["message"]) > 2000:
             r["message"] = r["message"][:2000] + " …(truncated)"
         if len(r["stack"]) > 2000:
@@ -157,9 +222,11 @@ def main():
 
     by_signature = {}
     by_class = {}
+    by_category = {}
     for r in failed:
         by_signature[r["signature"]] = by_signature.get(r["signature"], 0) + 1
         by_class[r["class"]] = by_class.get(r["class"], 0) + 1
+        by_category[r["category"]] = by_category.get(r["category"], 0) + 1
 
     total = len(all_results)
     record = {
@@ -182,9 +249,10 @@ def main():
         "failed": len(failed),
         "skipped": len(skipped),
         "pass_rate": round(100.0 * len(passed) / total, 1) if total else 0.0,
+        "by_category": dict(sorted(by_category.items(), key=lambda kv: -kv[1])),
         "by_signature": dict(sorted(by_signature.items(), key=lambda kv: -kv[1])),
         "by_class": dict(sorted(by_class.items(), key=lambda kv: -kv[1])),
-        "failures": sorted(failed, key=lambda r: (r["signature"], r["name"])),
+        "failures": sorted(failed, key=lambda r: (r["category"], r["signature"], r["name"])),
     }
 
     with open(out_path, "w") as f:
diff --git a/.github/scripts/update-e2e-dashboard.py b/.github/scripts/update-e2e-dashboard.py
index 6b03500c..d9b7b7e8 100644
--- a/.github/scripts/update-e2e-dashboard.py
+++ b/.github/scripts/update-e2e-dashboard.py
@@ -64,7 +64,7 @@ def main():
     summary = {k: record[k] for k in (
         "run_id", "run_attempt", "run_number", "timestamp", "commit", "branch",
         "protocol", "read_only", "html_url", "total", "passed", "failed",
-        "skipped", "pass_rate", "by_signature", "by_class",
+        "skipped", "pass_rate", "by_category", "by_signature", "by_class",
     ) if k in record}
     summary["detail"] = detail_name