Empty file added scripts/.gitkeep
206 changes: 206 additions & 0 deletions scripts/psog_archive_fancy.sh
@@ -0,0 +1,206 @@
#!/usr/bin/env bash
# psog_archive_fancy.sh
# Fancy competitor-page archiver for ProSe research
#
# Usage:
# ./psog_archive_fancy.sh "<URL>" [--mirror] [--pdf] [--notes "quick notes here"] [--outdir ~/psog_archives]
#
# Examples:
# ./psog_archive_fancy.sh "https://example.com/page?utm=123" --pdf --notes "RevverDocs lead gen page"
# EXCLUDE_BIG=true ./psog_archive_fancy.sh "https://..." --mirror
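# ./psog_archive_fancy.sh "https://example.com/pricing" --mirror --outdir /path/to/external_drive/psog_archives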
set -euo pipefail

# ----------------- configuration -----------------
URL="${1:-}"
if [ -z "$URL" ]; then
cat <<EOF
Usage: $0 "<URL>" [--mirror] [--pdf] [--notes "text"] [--outdir DIR]
Options:
--mirror Attempt full wget mirror (requires wget)
--pdf Try to save PDF snapshot (uses chrome/chromium or wkhtmltopdf if available)
--notes TEXT Add a short note string to notes.md
--outdir DIR Base archive dir (default: \$HOME/psog_archives)
EOF
exit 2
fi

shift || true
MIRROR=false
PDF=false
NOTES=""
OUTBASE="${HOME}/psog_archives"
EXCLUDE_BIG=${EXCLUDE_BIG:-false} # set env var to true to avoid mirroring huge assets

while [ "$#" -gt 0 ]; do
case "$1" in
--mirror) MIRROR=true; shift ;;
--pdf) PDF=true; shift ;;
--notes) NOTES="$2"; shift 2 ;;
--outdir) OUTBASE="$2"; shift 2 ;;
*) echo "Unknown arg: $1"; exit 1 ;;
esac
done

# ----------------- helpers -----------------
now_ts() { date -u +"%Y%m%dT%H%M%SZ"; }
slugify() {
# naive slug: lowercase, remove protocol, replace non-alnum with _
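# e.g. "https://Example.com/Pricing/Plans" -> "example.com_pricing_plans"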
echo "$1" | sed -E 's#^https?://##; s#[/?&=]#_#g; s#[^A-Za-z0-9._-]#_#g' | tr '[:upper:]' '[:lower:]' | cut -c1-120
}
which_chrome() {
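# print the name of the first Chrome/Chromium binary found on PATH; returns non-zero if none is found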
for b in google-chrome-stable google-chrome chromium chromium-browser chrome; do
if command -v "$b" >/dev/null 2>&1; then
echo "$b"
return 0
fi
done
return 1
}
human_size() {
# simple bytes -> human
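# usage: echo 1536000 | human_size   # prints "1.5M"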
awk 'function human(x){
s="BKMGTPE"; n=0; while(x>1024 && n<6){x/=1024; n++}
return sprintf("%.1f%s",x,substr(s,n+1,1))
} {print human($1)}'
}

# ----------------- prepare paths -----------------
TS=$(now_ts)
CANONICAL=$(echo "$URL" | sed -E 's/[?].*$//')
SLUG=$(slugify "$CANONICAL")_"$TS"
ARCHDIR="$OUTBASE/${SLUG}"
mkdir -p "$ARCHDIR"

META="$ARCHDIR/metadata.txt"
INDEX="$OUTBASE/index.tsv"
PAGE="$ARCHDIR/page.html"
PAGE_SHA="$ARCHDIR/page.sha256"
HEADERS="$ARCHDIR/headers.txt"
FAVICON="$ARCHDIR/favicon.ico"
PDFPATH="$ARCHDIR/page.pdf"
MIRROR_DIR="$ARCHDIR/mirror"

# ----------------- metadata -----------------
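# the Tools-available line records which of curl / wget / chrome / wkhtmltopdf are on PATH, printing "-" for any that are missing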
cat > "$META" <<EOF
Saved: $(now_ts)
Source URL: $URL
Canonical URL: $CANONICAL
Saved-by: $(whoami)@$(hostname)
Notes: ${NOTES:-}
Tools-available: $(command -v curl >/dev/null 2>&1 && echo curl || echo -) $(command -v wget >/dev/null 2>&1 && echo wget || echo -) $(which_chrome >/dev/null 2>&1 && echo chrome || echo -) $(command -v wkhtmltopdf >/dev/null 2>&1 && echo wkhtmltopdf || echo -)
EOF

# ----------------- fetch headers + HTML -----------------
echo "Fetching headers and HTML..."
if command -v curl >/dev/null 2>&1; then
# -D writes the response headers (including any redirects) to $HEADERS; -o saves the body to $PAGE
curl -L -sS -D "$HEADERS" -o "$PAGE" "$URL" || echo "curl failed to fetch main page"
else
echo "curl not found; skipping HTML fetch"
fi

# compute SHA256 if page saved
if [ -s "$PAGE" ]; then
sha256sum "$PAGE" | awk '{print $1}' > "$PAGE_SHA"
echo "Saved HTML -> $PAGE (SHA256: $(cat $PAGE_SHA))"
else
echo "No HTML saved (page may be blocked or curl missing)."
fi

# ----------------- try to fetch favicon -----------------
# attempt common locations intelligently
echo "Attempting favicon fetch..."
FAV_GUESS=""
# try to parse <link rel="icon"...> from page quickly
if [ -s "$PAGE" ]; then
FAV_GUESS=$(grep -iEo '<link[^>]+rel=["'\'']?(shortcut icon|icon)[^>]*>' "$PAGE" | sed -nE "s/.*href=['\"]?([^'\" ]+).*/\1/p" | head -n1 || true)
fi
if [ -z "$FAV_GUESS" ]; then
# fallback to /favicon.ico
FAV_GUESS="${CANONICAL%/}/favicon.ico"
fi

if command -v curl >/dev/null 2>&1 && [ -n "$FAV_GUESS" ]; then
# if guess is relative, build absolute
if [[ "$FAV_GUESS" != http* ]]; then
base=$(echo "$CANONICAL" | sed -E 's#(https?://[^/]+).*#\1#')
FAV_GUESS="$base/$FAV_GUESS"
fi
curl -L -sS "$FAV_GUESS" -o "$FAVICON" || true
if [ -s "$FAVICON" ]; then
echo "Fetched favicon -> $FAVICON"
else
rm -f "$FAVICON" 2>/dev/null || true
echo "No favicon found at guessed location."
fi
fi

# ----------------- optional: full mirror with wget -----------------
if [ "$MIRROR" = true ] && command -v wget >/dev/null 2>&1; then
if [ "$EXCLUDE_BIG" = "true" ]; then
echo "Mirror requested but EXCLUDE_BIG=true; skipping large-asset heavy mirroring."
else
echo "Running wget mirror (may be large) into $MIRROR_DIR ..."
mkdir -p "$MIRROR_DIR"
wget --page-requisites --adjust-extension --convert-links --no-clobber -e robots=off --wait=1 -P "$MIRROR_DIR" --restrict-file-names=windows "$URL" 2>&1 | sed -n '1,200p' || true
# create a tar of mirror for compact archival
tar -C "$MIRROR_DIR" -czf "$ARCHDIR/mirror.tgz" . || true
echo "Mirror saved (and tarred) to $ARCHDIR/mirror.tgz"
fi
elif [ "$MIRROR" = true ]; then
echo "Mirror requested but wget not installed; skipping mirror."
fi

# ----------------- optional: PDF snapshot -----------------
if [ "$PDF" = true ]; then
echo "Attempting PDF snapshot..."
CHROME_BIN=$(which_chrome || true)
if [ -n "$CHROME_BIN" ] && command -v "$CHROME_BIN" >/dev/null 2>&1; then
echo "Using headless chrome ($CHROME_BIN) to print to PDF -> $PDFPATH"
# Chrome/Chromium headless print: some versions accept --headless --print-to-pdf=...
"$CHROME_BIN" --headless --disable-gpu --no-sandbox --print-to-pdf="$PDFPATH" "$URL" >/dev/null 2>&1 || {
echo "chrome print-to-pdf failed; trying wkhtmltopdf if available..."
}
fi
if [ ! -s "$PDFPATH" ] && command -v wkhtmltopdf >/dev/null 2>&1; then
wkhtmltopdf --quiet "$URL" "$PDFPATH" || true
fi
if [ -s "$PDFPATH" ]; then
echo "PDF saved: $PDFPATH ($(stat -c%s "$PDFPATH" | human_size))"
else
echo "PDF snapshot not produced (no suitable tool or failure)."
fi
fi

# ----------------- supplementary: index line, package, checksum -----------------
# compute an archive checksum (tar.gz of archived dir contents for integrity)
# exclude the package itself so tar does not try to archive the file it is writing
tar -C "$ARCHDIR" --exclude=package.tgz -czf "$ARCHDIR/package.tgz" . || true
if [ -s "$ARCHDIR/package.tgz" ]; then
sha256sum "$ARCHDIR/package.tgz" | awk '{print $1}' > "$ARCHDIR/package.tgz.sha256" || true
fi
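# (to verify later: re-run sha256sum on package.tgz and compare against package.tgz.sha256)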

# Append summary line to index.tsv (create file if missing)
mkdir -p "$OUTBASE"
IDX_LINE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"$'\t'"$SLUG"$'\t'"$URL"$'\t'"$ARCHDIR"$'\t'"$( [ -s "$PAGE_SHA" ] && cat "$PAGE_SHA" || echo - )"$'\t'"$( [ -s "$ARCHDIR/package.tgz.sha256" ] && cat "$ARCHDIR/package.tgz.sha256" || echo - )"
printf '%s\n' "$IDX_LINE" >> "$INDEX"

# ----------------- final report -----------------
cat <<EOF

Saved archive: $ARCHDIR
metadata: $META
html: $PAGE $( [ -s "$PAGE" ] && echo "(sha256: $(cat "$PAGE_SHA"))" || echo "(not saved)" )
headers: $HEADERS
favicon: $( [ -s "$FAVICON" ] && echo "$FAVICON" || echo "not downloaded" )
package: $( [ -s "$ARCHDIR/package.tgz" ] && echo "$ARCHDIR/package.tgz (sha256: $(cat "$ARCHDIR/package.tgz.sha256"))" || echo "none" )
mirror: $( [ -d "$MIRROR_DIR" ] && echo "$MIRROR_DIR (mirror.tgz saved)" || echo "none" )
pdf: $( [ -s "$PDFPATH" ] && echo "$PDFPATH" || echo "none" )
Index updated: $INDEX

Hints:
- If you plan to do many archives, rsync the entire $OUTBASE to a USB/thumb drive or cloud backup.
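- Example backup command (destination path is just an illustration): rsync -av "$OUTBASE/" /path/to/backup/psog_archives/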
- To avoid filling your Chromebook storage, use --mirror sparingly and set EXCLUDE_BIG=true in env.

EOF