From 59b10c43bad6c04e60b5791dcc6f3dc4eeebd42c Mon Sep 17 00:00:00 2001 From: eric-wang-1990 Date: Thu, 4 Jun 2026 23:49:59 +0000 Subject: [PATCH] ci: classify Reyden nightly E2E failures by root cause The raw pass-rate on the SEA/Reyden nightly is dominated by expected failures (unsupported DDL/types, no Thrift endpoint, no CloudFetch), which hides the genuine driver bugs. Layer a coarse root-cause category on top of the existing per-failure signature so the dashboard separates the two. parse-trx-to-json.py: - Refine signature_for() with Reyden-specific buckets: Thrift-on-SEA endpoint (ENDPOINT_NOT_FOUND + "Thrift server error"), CloudFetch download failure, and unsupported DDL/statement/type. Reorder so PARSE_SYNTAX_ERROR is matched before the broad assertion bucket (whose "expected" token also matches "Expected identifier" syntax errors). - Add category_for() with three buckets and emit per-failure "category" plus a "by_category" rollup. Classification follows the failing step encoded in the message: a rejected CREATE surfaces as an "Unsupported ..." gap, whereas a value/cast mismatch means setup succeeded and the round-trip returned wrong data -- a real driver bug. update-e2e-dashboard.py: propagate by_category into the runs.json summary row. index.html: render a "By root-cause category" rollup with a color-coded legend, and group the expanded failure detail by category then signature. Degrades gracefully for older runs that predate by_category. 
Co-authored-by: Isaac --- .github/e2e-dashboard/index.html | 37 ++++++++++++- .github/scripts/parse-trx-to-json.py | 74 ++++++++++++++++++++++++- .github/scripts/update-e2e-dashboard.py | 2 +- 3 files changed, 108 insertions(+), 5 deletions(-) diff --git a/.github/e2e-dashboard/index.html b/.github/e2e-dashboard/index.html index 14807b33..600b8de8 100644 --- a/.github/e2e-dashboard/index.html +++ b/.github/e2e-dashboard/index.html @@ -55,6 +55,12 @@ .muted { color:var(--muted); } .small { font-size:12px; } .group-h { margin:12px 0 4px; font-size:13px; font-weight:700; } canvas { max-height:340px; } + .catbadge { display:inline-block; font-size:12px; font-weight:600; padding:2px 8px; border-radius:6px; } + .cat-gap { background:#e8eefc; color:#2949b8; } + .cat-env { background:#fdf3dc; color:#9a7400; } + .cat-real { background:#fbe7e7; color:#d83b3b; } + .legend { list-style:none; margin:10px 0 4px; padding:0; font-size:12px; color:var(--muted); } + .legend li { margin:5px 0; line-height:1.5; } @@ -98,6 +104,21 @@

Run history (click a row to expand failures) { if (!ts) return "—"; const d = new Date(ts); return isNaN(d) ? ts : d.toISOString().slice(0,16).replace("T"," "); }; const esc = s => (s||"").replace(/[&<>"]/g, c => ({"&":"&","<":"<",">":">",'"':"""}[c])); +// Root-cause categories emitted by parse-trx-to-json.py. A low raw pass-rate is +// usually dominated by expected Reyden gaps; the "Real issue" bucket is the +// actual backlog. A test with a CREATE step that Reyden can't run fails AT that +// step (an "Unsupported …" gap); a value mismatch means setup succeeded and the +// round-trip returned wrong data — a genuine bug. +const CATEGORIES = { + "Reyden capability gap (expected)": { cls:"cat-gap", + blurb:"The backend doesn't support the operation or protocol — unsupported DDL/statement/type (e.g. CREATE TABLE/SCHEMA), no Thrift endpoint on a SEA/REST-only warehouse, or CloudFetch. Expected: gate/skip these; they shouldn't count against the driver." }, + "Environment / infra": { cls:"cat-env", + blurb:"Missing/misconfigured warehouse, read-only rejection, auth, timeout, or transport error. Not a driver bug — fix the environment or retry." }, + "Real issue / to investigate": { cls:"cat-real", + blurb:"A genuine driver bug. A value/cast mismatch on an INSERT→SELECT→DELETE round-trip means setup succeeded but the data came back wrong (e.g. a SEA-path serialization difference); also covers SQL/syntax errors and anything unclassified." }, +}; +const catBadge = name => `${esc(name)}`; + function bar(p, f, s) { const t = p+f+s || 1; return `
@@ -142,6 +163,18 @@

Run history (click a row to expand failures) + `${catBadge(k)}${v}`).join(""); + const legend = catEntries.length + ? `
    ` + catEntries.map(([k]) => + `
  • ${catBadge(k)} ${esc((CATEGORIES[k]||{}).blurb||"")}
  • `).join("") + `
` + : ""; + const catSection = catEntries.length ? ` +
By root-cause category
+ ${catRows}
CategoryCount
+ ${legend}` : ""; + const sigRows = Object.entries(latest.by_signature||{}).map(([k,v]) => `${esc(k)}${v}`).join(""); const clsRows = Object.entries(latest.by_class||{}).map(([k,v]) => @@ -149,6 +182,7 @@

Run history (click a row to expand failures)Latest run #${esc(latest.run_number||latest.run_id)} (${esc(latest.protocol)}${latest.read_only?", read-only":""}) — ${latest.failed} failures grouped below.

+ ${catSection}
By failure signature (root cause)
${sigRows}
SignatureCount
By test class
@@ -167,8 +201,9 @@

Run history (click a row to expand failures)${catBadge(f.category)}

`; } if (f.signature !== curSig) { curSig = f.signature; html += `
${esc(curSig)}
`; } html += ``; diff --git a/.github/scripts/parse-trx-to-json.py b/.github/scripts/parse-trx-to-json.py index 4bbdcbb1..f33d46e1 100644 --- a/.github/scripts/parse-trx-to-json.py +++ b/.github/scripts/parse-trx-to-json.py @@ -48,16 +48,43 @@ def signature_for(message): return "Unknown / no message" m = message.replace("\r", " ").replace("\n", " ") + # Order matters: the most specific patterns win. In particular the + # Thrift-on-SEA and CloudFetch buckets must be matched before the generic + # "Couldn't connect / HttpRequestException" transport bucket, because their + # messages also contain that text. patterns = [ + # --- Reyden capability gaps (expected; see SIG_CATEGORY) ------------- + # A Thrift session against a SEA/REST-only warehouse (e.g. Reyden) has + # no Thrift endpoint, so the server returns ENDPOINT_NOT_FOUND. This is + # NOT a missing/misconfigured warehouse — the warehouse works over REST. + (r"Thrift server error.*ENDPOINT_NOT_FOUND|ENDPOINT_NOT_FOUND.*Thrift server error", + "Thrift endpoint unavailable on SEA/Reyden warehouse"), + (r"Error in download process|CloudFetch.*download", + "CloudFetch not supported on Reyden (download failed)"), + (r"PARSER_UNSUPPORTED_FEATURE|UNSUPPORTED_FEATURE|Unsupported statement|" + r"Unsupported CREATE type|Unsupported Delta table type|Unsupported .*type", + "Reyden unsupported feature (DDL / statement / type)"), + # --- Environment / infra -------------------------------------------- (r"ENDPOINT_NOT_FOUND", "Warehouse not found (ENDPOINT_NOT_FOUND / HTTP 404)"), (r"read[- ]?only|READ_ONLY|cannot be modified|not.*allowed.*read", "Read-only warehouse rejected write/DDL"), - (r"INSERT|UPDATE|DELETE|MERGE|CREATE TABLE|DROP TABLE|ALTER TABLE", "DML/DDL rejected"), (r"PERMISSION_DENIED|not authorized|Forbidden|HTTP 403", "Permission denied (403)"), (r"timeout|timed out|TimeoutException", "Timeout"), (r"Couldn't connect|connection refused|HttpRequestException", "Connection / transport error"), - 
(r"TABLE_OR_VIEW_NOT_FOUND|cannot be found|does not exist", "Object not found"), + # --- Genuine SQL errors with specific codes ------------------------ + # Checked before the broad assertion bucket below, whose "expected" + # token also appears in "Expected identifier ..." syntax-error text. (r"PARSE_SYNTAX_ERROR|SYNTAX_ERROR", "SQL syntax error"), + # --- Genuine driver bugs -------------------------------------------- + # A value/cast mismatch means the test got PAST any setup (a rejected + # CREATE TABLE would have failed earlier with an "Unsupported …" message + # in the Reyden-gap bucket above). So a wrong value on an + # INSERT→SELECT→DELETE round-trip is a real driver bug, e.g. a SEA-path + # result-serialization difference — not expected Reyden behaviour. + (r"CAST_INVALID_INPUT", "Type cast mismatch on round-trip"), + (r"TABLE_OR_VIEW_NOT_FOUND|cannot be found|does not exist", "Object not found"), (r"Assert\.|Equal\(|Xunit|expected", "Assertion failed (value mismatch)"), + # --- Generic DML/DDL rejection (catch-all, lowest priority) -------- + (r"INSERT|UPDATE|DELETE|MERGE|CREATE TABLE|DROP TABLE|ALTER TABLE", "DML/DDL rejected"), ] for pat, label in patterns: if re.search(pat, m, re.IGNORECASE): @@ -69,6 +96,43 @@ def signature_for(message): return "Other" +# Root-cause category layered on top of the fine-grained signature. The +# dashboard rolls failures up to these three buckets so a low raw pass-rate +# (dominated by expected Reyden gaps) doesn't mask the genuine driver bugs. +# +# Classification hinges on WHICH step failed, which the message already encodes: +# - A test with a CREATE TABLE/SCHEMA step that Reyden can't run fails AT that +# step with an "Unsupported …" message -> CAT_REYDEN_GAP (expected). +# - A value/cast mismatch means setup succeeded and the INSERT→SELECT→DELETE +# round-trip returned wrong data -> CAT_REAL (a genuine driver bug). 
+CAT_REYDEN_GAP = "Reyden capability gap (expected)" +CAT_ENVIRONMENT = "Environment / infra" +CAT_REAL = "Real issue / to investigate" + +# Explicit signature -> category map. Any signature not listed here (including +# the dynamic "" fallbacks and the value/cast/DML/syntax buckets) +# is treated as CAT_REAL so genuine, unclassified failures surface rather than +# hide. +_SIGNATURE_CATEGORY = { + "Thrift endpoint unavailable on SEA/Reyden warehouse": CAT_REYDEN_GAP, + "CloudFetch not supported on Reyden (download failed)": CAT_REYDEN_GAP, + "Reyden unsupported feature (DDL / statement / type)": CAT_REYDEN_GAP, + "Warehouse not found (ENDPOINT_NOT_FOUND / HTTP 404)": CAT_ENVIRONMENT, + "Read-only warehouse rejected write/DDL": CAT_ENVIRONMENT, + "Permission denied (403)": CAT_ENVIRONMENT, + "Timeout": CAT_ENVIRONMENT, + "Connection / transport error": CAT_ENVIRONMENT, + # "Assertion failed (value mismatch)", "Type cast mismatch on round-trip", + # "Object not found", "DML/DDL rejected", "SQL syntax error", "Other", + # "Unknown / no message" and any "" fall through to CAT_REAL. +} + + +def category_for(signature): + """Map a fine-grained signature to one of the three root-cause buckets.""" + return _SIGNATURE_CATEGORY.get(signature, CAT_REAL) + + def class_of(test_name): """Best-effort owning class: strip the parameter list and the method.""" base = test_name.split("(", 1)[0] @@ -150,6 +214,7 @@ def main(): # Trim payload: keep full detail for failures only. 
for r in failed: r["signature"] = signature_for(r["message"]) + r["category"] = category_for(r["signature"]) if len(r["message"]) > 2000: r["message"] = r["message"][:2000] + " …(truncated)" if len(r["stack"]) > 2000: @@ -157,9 +222,11 @@ def main(): by_signature = {} by_class = {} + by_category = {} for r in failed: by_signature[r["signature"]] = by_signature.get(r["signature"], 0) + 1 by_class[r["class"]] = by_class.get(r["class"], 0) + 1 + by_category[r["category"]] = by_category.get(r["category"], 0) + 1 total = len(all_results) record = { @@ -182,9 +249,10 @@ def main(): "failed": len(failed), "skipped": len(skipped), "pass_rate": round(100.0 * len(passed) / total, 1) if total else 0.0, + "by_category": dict(sorted(by_category.items(), key=lambda kv: -kv[1])), "by_signature": dict(sorted(by_signature.items(), key=lambda kv: -kv[1])), "by_class": dict(sorted(by_class.items(), key=lambda kv: -kv[1])), - "failures": sorted(failed, key=lambda r: (r["signature"], r["name"])), + "failures": sorted(failed, key=lambda r: (r["category"], r["signature"], r["name"])), } with open(out_path, "w") as f: diff --git a/.github/scripts/update-e2e-dashboard.py b/.github/scripts/update-e2e-dashboard.py index 6b03500c..d9b7b7e8 100644 --- a/.github/scripts/update-e2e-dashboard.py +++ b/.github/scripts/update-e2e-dashboard.py @@ -64,7 +64,7 @@ def main(): summary = {k: record[k] for k in ( "run_id", "run_attempt", "run_number", "timestamp", "commit", "branch", "protocol", "read_only", "html_url", "total", "passed", "failed", - "skipped", "pass_rate", "by_signature", "by_class", + "skipped", "pass_rate", "by_category", "by_signature", "by_class", ) if k in record} summary["detail"] = detail_name