|
| 1 | +#!/usr/bin/env bash |
| 2 | +# |
| 3 | +# bench/phase0/run.sh — Run D: re-measure chromium escalation rate |
| 4 | +# with expanded discovery. Three-phase pipeline: |
| 5 | +# |
| 6 | +# Phase 1: trawl batch with pricing_url + expanded fallback selectors |
| 7 | +# Phase 2: trawl map on remaining misses, grep for pricing-like URLs |
| 8 | +# Phase 3: trawl batch on newly discovered URLs |
| 9 | +# |
| 10 | +# Compares results against the Lightpanda decision thresholds: |
| 11 | +# - n >= 500 reachable |
| 12 | +# - chromium escalation rate >= 15% |
| 13 | +# |
| 14 | +# Prior art: Run C (2026-04-10) hit 14.08% on n=355 reachable. |
| 15 | +# See docs/DECISIONS.md for the full history. |
| 16 | + |
set -euo pipefail

# Resolve all paths relative to this script so it can be invoked from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
SEED="${REPO_ROOT}/seed/companies.csv"
# Timestamped results dir: every invocation is a fresh, self-contained run.
RESULTS_DIR="${SCRIPT_DIR}/results-$(date +%Y%m%d-%H%M%S)"
# Overridable via the environment, e.g. TRAWL_BIN=./bin/trawl for a dev build.
TRAWL_BIN="${TRAWL_BIN:-trawl}"

# ── Configuration ────────────────────────────────────────────── 
N=1000           # first N rows from seed (same as Run C)
CONCURRENCY=15   # parallel fetches per batch
RATE=1           # per-domain requests/sec for batch
TIMEOUT=30s
MAP_DEPTH=1      # BFS depth for map discovery
MAP_LIMIT=50     # max URLs per homepage from map
MAP_PARALLEL=4   # parallel trawl map invocations
MAP_RATE=2       # rate limit per domain for map

# Run C had 3 patterns. Run D has 19.
# NOTE(review): these are substring matches — a[href*="/pro"] also hits
# /products, /profile, etc. Acceptable for discovery, but expect noise.
FALLBACK_SELECTOR='a[href*="pricing"], a[href*="/plans"], a[href*="/price"], a[href*="/subscribe"], a[href*="/subscription"], a[href*="/upgrade"], a[href*="/buy"], a[href*="/packages"], a[href*="/billing"], a[href*="/pro"], a[href*="/premium"], a[href*="/enterprise"], a[href*="/features"], a[href*="/cost"], a[href*="/rates"], a[href*="/tiers"], a[href*="/get-started"], a[href*="/signup"], a[href*="/order"]'

# Regex for filtering map output (extended grep, case-insensitive).
# Mirrors the 19 selector patterns above, but with a (/|$|\?) boundary
# so e.g. "pro" does not match "/products".
PRICING_REGEX='/(pricing|plans?|prices?|subscribe|subscription|upgrade|buy|packages?|billing|pro|premium|enterprise|features|cost|rates?|tiers?|get-started|signup|order)(/|$|\?)'
| 40 | + |
# ── Dependency checks ─────────────────────────────────────────
# Fail fast, before any work starts, if a required tool or the seed
# CSV is missing. Guard-clause form: each check aborts on its own.
for tool in "${TRAWL_BIN}" jq python3; do
  command -v "${tool}" >/dev/null 2>&1 || {
    echo "error: ${tool} not on PATH" >&2
    exit 1
  }
done
[[ -f "${SEED}" ]] || {
  echo "error: seed file not found: ${SEED}" >&2
  exit 1
}
| 52 | + |
# Isolated TRAWL_HOME so we don't contend with running jobs or
# pollute the user's tier-cache / content-cache.
export TRAWL_HOME="${RESULTS_DIR}/.trawl"
# mkdir -p creates RESULTS_DIR implicitly as the parent directory, so a
# separate mkdir for it would be redundant.
mkdir -p "${TRAWL_HOME}"

# Banner goes to stderr; stdout is reserved for the final report (step 7).
echo "═══════════════════════════════════════════════════════" >&2
echo "  Run D — Phase 0 Benchmark" >&2
echo "  results: ${RESULTS_DIR}" >&2
echo "═══════════════════════════════════════════════════════" >&2
echo "" >&2
| 64 | + |
# ══════════════════════════════════════════════════════════════
# STEP 1: Extract first N rows from seed
# ══════════════════════════════════════════════════════════════
echo "[step 1] extracting first ${N} rows from seed CSV..." >&2
# Header + first N data rows in a single pass. (The previous two-step
# header/body split was redundant, and `head -COUNT` is obsolete syntax;
# `head -n` is the portable form.)
head -n "$((N + 1))" "${SEED}" > "${RESULTS_DIR}/seed-${N}.csv"
# Report the actual row count — the seed may hold fewer than N rows.
ACTUAL=$(tail -n +2 "${RESULTS_DIR}/seed-${N}.csv" | wc -l | tr -d ' ')
echo "  → ${ACTUAL} data rows" >&2
| 73 | + |
# ══════════════════════════════════════════════════════════════
# STEP 2: Phase 1 — batch with expanded fallback selector
# ══════════════════════════════════════════════════════════════
echo "" >&2
echo "[step 2] phase 1: batch with expanded fallback (${ACTUAL} URLs)..." >&2
PHASE1_START=$(date +%s)

# Batch flags collected in an array for readability. Tier learning and
# robots are disabled to keep the measurement comparable with Run C;
# debug-level log lines are stripped from the mirrored output, and the
# trailing || true keeps a partially-failed batch from killing the run.
PHASE1_FLAGS=(
  --url-column pricing_url
  --fallback-column homepage
  --fallback-selector "${FALLBACK_SELECTOR}"
  --tiers http,chromium
  --concurrency "${CONCURRENCY}"
  --rate "${RATE}"
  --timeout "${TIMEOUT}"
  --browser-like
  --no-tier-learning
  --ignore-robots
  --job-id "phase0-rund-phase1"
  -o "${RESULTS_DIR}/phase1.jsonl"
)
"${TRAWL_BIN}" batch "${RESULTS_DIR}/seed-${N}.csv" "${PHASE1_FLAGS[@]}" \
  2>&1 | grep -v '"level":"debug"' >&2 || true

PHASE1_END=$(date +%s)
# JSONL output: one record per line, so line counts are record counts.
PHASE1_OK=$(jq -r 'select(.failure_category == "success") | .url' "${RESULTS_DIR}/phase1.jsonl" | wc -l | tr -d ' ')
PHASE1_TOTAL=$(wc -l < "${RESULTS_DIR}/phase1.jsonl" | tr -d ' ')
echo "  → ${PHASE1_OK}/${PHASE1_TOTAL} reachable in $((PHASE1_END - PHASE1_START))s" >&2
| 100 | + |
# ══════════════════════════════════════════════════════════════
# STEP 3: Identify failed rows → extract homepages for map
# ══════════════════════════════════════════════════════════════
echo "" >&2
echo "[step 3] identifying failed rows for map discovery..." >&2

# Get successfully fetched canonical URLs from phase 1.
# sort -u also satisfies comm's sorted-input requirement in step 4.
jq -r 'select(.failure_category == "success") | .canonical_url' \
  "${RESULTS_DIR}/phase1.jsonl" | sort -u > "${RESULTS_DIR}/phase1-ok.txt"

# Use Python for proper CSV parsing (seed has quoted commas in description).
# NOTE(review): the set membership below compares the seed's raw
# pricing_url against phase 1's *canonical* URLs — a redirect or
# normalization (trailing slash, http→https) makes a successfully
# fetched row look failed and re-enqueues its homepage for map.
# Harmless (over-mapping, not data loss), but worth confirming that
# trawl also emits the original .url if exact dedupe is wanted.
python3 - "${RESULTS_DIR}/seed-${N}.csv" "${RESULTS_DIR}/phase1-ok.txt" "${RESULTS_DIR}/map-targets.txt" << 'PYEOF'
import csv, sys

seed_path, ok_path, out_path = sys.argv[1], sys.argv[2], sys.argv[3]

# Load successfully fetched URLs (one per line, already deduplicated).
with open(ok_path) as f:
    ok_urls = set(line.strip() for line in f if line.strip())

# For each seed row, if the pricing_url wasn't fetched successfully,
# emit the homepage as a map target (if it exists and is non-empty).
# NOTE(review): DictReader yields None for missing trailing fields;
# .strip() would raise on such rows — assumes the seed CSV is rectangular.
targets = set()
with open(seed_path, newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        pricing = row.get('pricing_url', '').strip()
        homepage = row.get('homepage', '').strip()
        if pricing not in ok_urls and homepage:
            targets.add(homepage)

# Sorted output keeps the file deterministic across runs.
with open(out_path, 'w') as f:
    for url in sorted(targets):
        f.write(url + '\n')

print(f"  → {len(targets)} homepages to map-discover", file=sys.stderr)
PYEOF

MAP_COUNT=$(wc -l < "${RESULTS_DIR}/map-targets.txt" | tr -d ' ')
| 140 | + |
# ══════════════════════════════════════════════════════════════
# STEP 4: Phase 2 — trawl map on each homepage, filter for pricing
# ══════════════════════════════════════════════════════════════
echo "" >&2
echo "[step 4] phase 2: map discovery on ${MAP_COUNT} homepages (depth ${MAP_DEPTH}, ${MAP_PARALLEL} parallel)..." >&2
PHASE2_START=$(date +%s)

# Export variables so the xargs subshells can see them.
export TRAWL_BIN MAP_DEPTH MAP_LIMIT TIMEOUT MAP_RATE

# xargs runs MAP_PARALLEL trawl map processes concurrently; targets are
# fed on stdin (no `cat |` pipeline needed) and all stdout collects in
# map-raw.txt — the single > truncates any previous contents, replacing
# the old pre-truncate-then-append dance. The homepage URL is passed to
# the inner script as "$1", never spliced into the script text, so shell
# metacharacters in URLs cannot inject commands.
# Failures are silently dropped (|| true) — map is best-effort.
xargs -P "${MAP_PARALLEL}" -I{} \
  bash -c '
    urls=$("${TRAWL_BIN}" map "$1" \
      --depth "${MAP_DEPTH}" \
      --sources crawl \
      --same-domain \
      --limit "${MAP_LIMIT}" \
      --browser-like \
      --ignore-robots \
      --timeout "${TIMEOUT}" \
      --rate "${MAP_RATE}" \
      2>/dev/null) || true
    if [ -n "$urls" ]; then
      echo "$urls"
    fi
  ' _ {} \
  < "${RESULTS_DIR}/map-targets.txt" \
  > "${RESULTS_DIR}/map-raw.txt" 2>/dev/null || true

# Filter for pricing-like URLs and deduplicate against phase 1.
# comm requires both inputs sorted: sort -u covers the left side, and
# phase1-ok.txt was produced with sort -u in step 3.
# NOTE(review): raw map URLs are compared against canonical phase-1
# URLs, so a normalization mismatch can let a duplicate through — it
# just gets re-fetched in phase 3, which is harmless.
grep -iE "${PRICING_REGEX}" "${RESULTS_DIR}/map-raw.txt" 2>/dev/null \
  | sort -u \
  | comm -23 - "${RESULTS_DIR}/phase1-ok.txt" \
  > "${RESULTS_DIR}/phase2-urls.txt" || true

PHASE2_END=$(date +%s)
MAP_RAW=$(wc -l < "${RESULTS_DIR}/map-raw.txt" | tr -d ' ')
DISCOVERED=$(wc -l < "${RESULTS_DIR}/phase2-urls.txt" | tr -d ' ')
echo "  → ${MAP_RAW} URLs discovered, ${DISCOVERED} pricing-like (new) in $((PHASE2_END - PHASE2_START))s" >&2
| 183 | + |
# ══════════════════════════════════════════════════════════════
# STEP 5: Phase 3 — batch the discovered URLs
# ══════════════════════════════════════════════════════════════
# Only worth running if map discovery actually produced new URLs.
if (( DISCOVERED > 0 )); then
  echo "" >&2
  echo "[step 5] phase 3: batch ${DISCOVERED} map-discovered URLs..." >&2
  PHASE3_START=$(date +%s)

  # Same tier/rate settings as phase 1, but no url-column/fallback:
  # the input is a plain list of already-discovered pricing URLs.
  PHASE3_FLAGS=(
    --tiers http,chromium
    --concurrency "${CONCURRENCY}"
    --rate "${RATE}"
    --timeout "${TIMEOUT}"
    --browser-like
    --no-tier-learning
    --ignore-robots
    --job-id "phase0-rund-phase3"
    -o "${RESULTS_DIR}/phase3.jsonl"
  )
  "${TRAWL_BIN}" batch "${RESULTS_DIR}/phase2-urls.txt" "${PHASE3_FLAGS[@]}" \
    2>&1 | grep -v '"level":"debug"' >&2 || true

  PHASE3_END=$(date +%s)
  PHASE3_OK=$(jq -r 'select(.failure_category == "success") | .url' "${RESULTS_DIR}/phase3.jsonl" | wc -l | tr -d ' ')
  echo "  → ${PHASE3_OK} reachable from map-discovered in $((PHASE3_END - PHASE3_START))s" >&2
else
  echo "" >&2
  echo "[step 5] phase 3: skipped (no new URLs from map)" >&2
fi
| 211 | + |
# ══════════════════════════════════════════════════════════════
# STEP 6: Combine results and compute stats
# ══════════════════════════════════════════════════════════════
echo "" >&2
echo "[step 6] computing stats..." >&2

# Merge phase 1 and (if it ran) phase 3 into one JSONL stream.
# The `[ -f … ] &&` guard is safe under set -e: a failing command in an
# && list (other than the final one) does not abort the script.
cat "${RESULTS_DIR}/phase1.jsonl" > "${RESULTS_DIR}/combined.jsonl"
[ -f "${RESULTS_DIR}/phase3.jsonl" ] && cat "${RESULTS_DIR}/phase3.jsonl" >> "${RESULTS_DIR}/combined.jsonl"

# Slurp (-s) the combined JSONL into a single array and reduce it to the
# summary record. escalation_pct is chromium/reachable as a percentage,
# rounded to 2 decimals via the *10000 → round → /100 trick; the
# .reachable > 0 guard avoids a divide-by-zero when nothing succeeded.
jq -s '
  def is_success: .failure_category == "success";

  {
    total: length,
    reachable: [.[] | select(is_success)] | length,
    tier_http: [.[] | select(is_success and .tier == "http")] | length,
    tier_chromium: [.[] | select(is_success and .tier == "chromium")] | length,
    failures: (
      [.[] | select(is_success | not)]
      | group_by(.failure_category)
      | map({key: .[0].failure_category, value: length})
      | from_entries
    )
  } |
  . + {
    escalation_pct: (if .reachable > 0 then (.tier_chromium / .reachable * 10000 | round / 100) else 0 end)
  }
' "${RESULTS_DIR}/combined.jsonl" > "${RESULTS_DIR}/stats.json"
| 240 | + |
# ══════════════════════════════════════════════════════════════
# STEP 7: Print summary
# ══════════════════════════════════════════════════════════════
# Unlike all progress output above, the report itself goes to stdout so
# it can be piped or captured independently of the step-by-step log.
echo "" >&2
echo "═══════════════════════════════════════════════════════"
echo "  Run D — Phase 0 Results"
echo "═══════════════════════════════════════════════════════"
# Render stats.json into the human-readable decision summary:
# topline counts, the two go/no-go thresholds, a delta against Run C's
# recorded 14.08% / n=355, and a failure breakdown sorted by frequency.
jq -r '
  "  total records:            \(.total)",
  "  reachable (success):      \(.reachable)",
  "  tier http:                \(.tier_http)",
  "  tier chromium:            \(.tier_chromium)",
  "",
  "  chromium escalation rate: \(.escalation_pct)% (\(.tier_chromium)/\(.reachable))",
  "",
  "  ── Threshold check ──",
  "  n >= 500 reachable: \(if .reachable >= 500 then "PASS" else "FAIL (need \(500 - .reachable) more)" end)",
  "  rate >= 15%:        \(if .escalation_pct >= 15 then "CROSSED — Lightpanda reopens" else "BELOW — skip holds (\(.escalation_pct)% < 15%)" end)",
  "",
  "  ── vs Run C ──",
  "  Run C: 14.08% on n=355",
  "  Run D: \(.escalation_pct)% on n=\(.reachable)",
  "  delta: \((.escalation_pct - 14.08) * 100 | round / 100)pp rate, +\(.reachable - 355) reachable",
  "",
  "  ── Failure breakdown ──",
  (.failures | to_entries | sort_by(-.value) | .[] | "    \(.key): \(.value)")
' "${RESULTS_DIR}/stats.json"
echo "═══════════════════════════════════════════════════════"
echo ""
# Artifact index for whoever picks up the results directory later.
echo "artifacts: ${RESULTS_DIR}/"
echo "  seed-${N}.csv      input (${ACTUAL} rows)"
echo "  phase1.jsonl       batch results"
echo "  map-targets.txt    homepages sent to map (${MAP_COUNT})"
echo "  map-raw.txt        all URLs from map (${MAP_RAW})"
echo "  phase2-urls.txt    pricing-like discoveries (${DISCOVERED})"
# && guard is set -e safe (see step 6); it is not the script's final
# command, so a missing phase3.jsonl cannot taint the exit status either.
[ -f "${RESULTS_DIR}/phase3.jsonl" ] && echo "  phase3.jsonl       map-discovery batch results"
echo "  combined.jsonl     merged results"
echo "  stats.json         machine-readable stats"
0 commit comments