diff --git a/scripts/.gitkeep b/scripts/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/scripts/psog_archive_fancy.sh b/scripts/psog_archive_fancy.sh
new file mode 100755
index 0000000..808c563
--- /dev/null
+++ b/scripts/psog_archive_fancy.sh
@@ -0,0 +1,206 @@
+#!/usr/bin/env bash
+# psog_archive_fancy.sh
+# Fancy competitor-page archiver for ProSe research
+#
+# Usage:
+#   ./psog_archive_fancy.sh "<url>" [--mirror] [--pdf] [--notes "quick notes here"] [--outdir ~/psog_archives]
+#
+# Examples:
+#   ./psog_archive_fancy.sh "https://example.com/page?utm=123" --pdf --notes "RevverDocs lead gen page"
+#   EXCLUDE_BIG=true ./psog_archive_fancy.sh "https://..." --mirror
+set -euo pipefail
+
+# ----------------- configuration -----------------
+URL="${1:-}"
+if [ -z "$URL" ]; then
+  cat <<EOF
+Usage: $0 "<url>" [--mirror] [--pdf] [--notes "text"] [--outdir DIR]
+Options:
+  --mirror       Attempt full wget mirror (requires wget)
+  --pdf          Try to save PDF snapshot (uses chrome/chromium or wkhtmltopdf if available)
+  --notes TEXT   Add a short note string to notes.md
+  --outdir DIR   Base archive dir (default: \$HOME/psog_archives)
+EOF
+  exit 2
+fi
+
+shift || true
+MIRROR=false
+PDF=false
+NOTES=""
+OUTBASE="${HOME}/psog_archives"
+EXCLUDE_BIG=${EXCLUDE_BIG:-false}  # set env var to true to avoid mirroring huge assets
+
+while [ "$#" -gt 0 ]; do
+  case "$1" in
+    --mirror) MIRROR=true; shift ;;
+    --pdf) PDF=true; shift ;;
+    --notes) NOTES="$2"; shift 2 ;;
+    --outdir) OUTBASE="$2"; shift 2 ;;
+    *) echo "Unknown arg: $1"; exit 1 ;;
+  esac
+done
+
+# ----------------- helpers -----------------
+now_ts() { date -u +"%Y%m%dT%H%M%SZ"; }
+slugify() {
+  # naive slug: lowercase, remove protocol, replace non-alnum with _
+  echo "$1" | sed -E 's#^https?://##; s#[/?&=]#_#g; s#[^A-Za-z0-9._-]#_#g' | tr '[:upper:]' '[:lower:]' | cut -c1-120
+}
+which_chrome() {
+  for b in google-chrome-stable google-chrome chromium chromium-browser chrome; do
+    if command -v "$b" >/dev/null 2>&1; then
+      echo "$b"
+      return 0
+    fi
+  done
+  return 1
+}
+human_size() {
+  # simple bytes -> human, reading a byte count from stdin
+  awk 'function human(x){
+         s="BKMGTPE"; n=0; while(x>1024 && n<6){x/=1024; n++}
+         return sprintf("%.1f%s",x,substr(s,n+1,1))
+       } {print human($1)}'
+}
+
+# ----------------- prepare paths -----------------
+TS=$(now_ts)
+CANONICAL=$(echo "$URL" | sed -E 's/[?].*$//')
+SLUG=$(slugify "$CANONICAL")_"$TS"
+ARCHDIR="$OUTBASE/${SLUG}"
+mkdir -p "$ARCHDIR"
+
+META="$ARCHDIR/metadata.txt"
+INDEX="$OUTBASE/index.tsv"
+PAGE="$ARCHDIR/page.html"
+PAGE_SHA="$ARCHDIR/page.sha256"
+HEADERS="$ARCHDIR/headers.txt"
+FAVICON="$ARCHDIR/favicon.ico"
+PDFPATH="$ARCHDIR/page.pdf"
+MIRROR_DIR="$ARCHDIR/mirror"
+
+# ----------------- metadata -----------------
+cat > "$META" <<EOF
+url: $URL
+canonical: $CANONICAL
+timestamp_utc: $TS
+notes: $NOTES
+tools: $(command -v curl >/dev/null 2>&1 && echo curl || echo -) $(command -v wget >/dev/null 2>&1 && echo wget || echo -) $(which_chrome >/dev/null 2>&1 && echo chrome || echo -) $(command -v wkhtmltopdf >/dev/null 2>&1 && echo wkhtmltopdf || echo -)
+EOF
+
+# write notes.md, as promised in the usage text
+[ -n "$NOTES" ] && printf '%s\n' "$NOTES" > "$ARCHDIR/notes.md" || true
+
+# ----------------- fetch headers + HTML -----------------
+echo "Fetching headers..."
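+# Note: with -L, curl's -D records the headers of each response in the
+# redirect chain in one file; fetching headers and body in a single GET
+# (below) avoids the HEAD-vs-GET mismatch some servers exhibit.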
+if command -v curl >/dev/null 2>&1; then
+  curl -L -sS -D "$HEADERS" -o "$PAGE" "$URL" || { echo "curl failed to fetch main page"; true; }
+else
+  echo "curl not found; skipping HTML fetch"
+fi
+
+# compute SHA256 if page saved
+if [ -s "$PAGE" ]; then
+  sha256sum "$PAGE" | awk '{print $1}' > "$PAGE_SHA"
+  echo "Saved HTML -> $PAGE (SHA256: $(cat "$PAGE_SHA"))"
+else
+  echo "No HTML saved (page may be blocked or curl missing)."
+fi
+
+# ----------------- try to fetch favicon -----------------
+# attempt common locations intelligently
+echo "Attempting favicon fetch..."
+FAV_GUESS=""
+# try to parse from page quickly
+if [ -s "$PAGE" ]; then
+  FAV_GUESS=$(grep -iEo '<link[^>]+rel=["'\'']?(shortcut icon|icon)[^>]*>' "$PAGE" | sed -nE "s/.*href=['\"]?([^'\" ]+).*/\1/p" | head -n1 || true)
+fi
+if [ -z "$FAV_GUESS" ]; then
+  # fallback to /favicon.ico
+  FAV_GUESS="${CANONICAL%/}/favicon.ico"
+fi
+
+if command -v curl >/dev/null 2>&1 && [ -n "$FAV_GUESS" ]; then
+  # if guess is relative, build absolute (strip any leading / to avoid //)
+  if [[ "$FAV_GUESS" != http* ]]; then
+    base=$(echo "$CANONICAL" | sed -E 's#(https?://[^/]+).*#\1#')
+    FAV_GUESS="$base/${FAV_GUESS#/}"
+  fi
+  curl -L -sS "$FAV_GUESS" -o "$FAVICON" || true
+  if [ -s "$FAVICON" ]; then
+    echo "Fetched favicon -> $FAVICON"
+  else
+    rm -f "$FAVICON" 2>/dev/null || true
+    echo "No favicon found at guessed location."
+  fi
+fi
+
+# ----------------- optional: full mirror with wget -----------------
+if [ "$MIRROR" = true ] && command -v wget >/dev/null 2>&1; then
+  if [ "$EXCLUDE_BIG" = "true" ]; then
+    echo "Mirror requested but EXCLUDE_BIG=true; skipping large-asset heavy mirroring."
+  else
+    echo "Running wget mirror (may be large) into $MIRROR_DIR ..."
+    mkdir -p "$MIRROR_DIR"
+    wget --page-requisites --adjust-extension --convert-links --no-clobber -e robots=off --wait=1 -P "$MIRROR_DIR" --restrict-file-names=windows "$URL" 2>&1 | sed -n '1,200p' || true
+    # create a tar of mirror for compact archival
+    tar -C "$MIRROR_DIR" -czf "$ARCHDIR/mirror.tgz" . || true
+    echo "Mirror saved (and tarred) to $ARCHDIR/mirror.tgz"
+  fi
+elif [ "$MIRROR" = true ]; then
+  echo "Mirror requested but wget not installed; skipping mirror."
+fi
+
+# ----------------- optional: PDF snapshot -----------------
+if [ "$PDF" = true ]; then
+  echo "Attempting PDF snapshot..."
+  CHROME_BIN=$(which_chrome || true)
+  if [ -n "$CHROME_BIN" ]; then
+    echo "Using headless chrome ($CHROME_BIN) to print to PDF -> $PDFPATH"
+    # Chrome/Chromium headless print: some versions accept --headless --print-to-pdf=...
+    "$CHROME_BIN" --headless --disable-gpu --no-sandbox --print-to-pdf="$PDFPATH" "$URL" >/dev/null 2>&1 || {
+      echo "chrome print-to-pdf failed; trying wkhtmltopdf if available..."
+    }
+  fi
+  if [ ! -s "$PDFPATH" ] && command -v wkhtmltopdf >/dev/null 2>&1; then
+    wkhtmltopdf --quiet "$URL" "$PDFPATH" || true
+  fi
+  if [ -s "$PDFPATH" ]; then
+    echo "PDF saved: $PDFPATH ($(stat -c%s "$PDFPATH" | human_size))"
+  else
+    echo "PDF snapshot not produced (no suitable tool or failure)."
+  fi
+fi
+
+# ----------------- supplementary: index line, package, checksum -----------------
+# compute an archive checksum (tar.gz of archived dir contents for integrity);
+# --exclude keeps the tarball from trying to swallow itself as it is written
+tar -C "$ARCHDIR" --exclude=package.tgz -czf "$ARCHDIR/package.tgz" . || true
+if [ -s "$ARCHDIR/package.tgz" ]; then
+  sha256sum "$ARCHDIR/package.tgz" | awk '{print $1}' > "$ARCHDIR/package.tgz.sha256" || true
+fi
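+
+# index.tsv schema (tab-separated; field names are informal labels for the
+# IDX_LINE built below):
+#   utc_timestamp  slug  url  archive_dir  page_sha256  package_sha256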
+# Append summary line to index.tsv (create file if missing)
+mkdir -p "$OUTBASE"
+IDX_LINE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"$'\t'"$SLUG"$'\t'"$URL"$'\t'"$ARCHDIR"$'\t'"$( [ -s "$PAGE_SHA" ] && cat "$PAGE_SHA" || echo - )"$'\t'"$( [ -s "$ARCHDIR/package.tgz.sha256" ] && cat "$ARCHDIR/package.tgz.sha256" || echo - )"
+# printf, not echo -e: the URL may contain backslashes that -e would mangle
+printf '%s\n' "$IDX_LINE" >> "$INDEX"
+
+# ----------------- final report -----------------
+cat <<EOF
+Archive complete.
+  dir:     $ARCHDIR
+  html:    $( [ -s "$PAGE" ] && echo "$PAGE" || echo - )
+  pdf:     $( [ -s "$PDFPATH" ] && echo "$PDFPATH" || echo - )
+  package: $ARCHDIR/package.tgz
+  index:   $INDEX
+EOF
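+
+# Verification sketch for a finished archive (assumes GNU coreutils and tar;
+# hypothetical session, adjust paths to taste):
+#   cd "$ARCHDIR"
+#   sha256sum -c <(printf '%s  package.tgz\n' "$(cat package.tgz.sha256)")
+#   mkdir -p restored && tar -xzf package.tgz -C restored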