Empty file added scripts/.gitkeep
206 changes: 206 additions & 0 deletions scripts/psog_archive_fancy.sh
@@ -0,0 +1,206 @@
#!/usr/bin/env bash
# psog_archive_fancy.sh
# Fancy competitor-page archiver for ProSe research
#
# Usage:
# ./psog_archive_fancy.sh "<URL>" [--mirror] [--pdf] [--notes "quick notes here"] [--outdir ~/psog_archives]
#
# Examples:
# ./psog_archive_fancy.sh "https://example.com/page?utm=123" --pdf --notes "RevverDocs lead gen page"
# EXCLUDE_BIG=true ./psog_archive_fancy.sh "https://..." --mirror
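# ./psog_archive_fancy.sh "https://example.com/pricing" --mirror --outdir /path/to/external_drive/psog_archives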
set -euo pipefail

# ----------------- configuration -----------------
URL="${1:-}"
if [ -z "$URL" ]; then
cat <<EOF
Usage: $0 "<URL>" [--mirror] [--pdf] [--notes "text"] [--outdir DIR]
Options:
--mirror Attempt full wget mirror (requires wget)
--pdf Try to save PDF snapshot (uses chrome/chromium or wkhtmltopdf if available)
--notes TEXT Add a short note string to notes.md
--outdir DIR Base archive dir (default: \$HOME/psog_archives)
EOF
exit 2
fi

shift || true
MIRROR=false
PDF=false
NOTES=""
OUTBASE="${HOME}/psog_archives"
EXCLUDE_BIG=${EXCLUDE_BIG:-false} # set env var to true to avoid mirroring huge assets

while [ "$#" -gt 0 ]; do
case "$1" in
--mirror) MIRROR=true; shift ;;
--pdf) PDF=true; shift ;;
--notes) NOTES="$2"; shift 2 ;;
--outdir) OUTBASE="$2"; shift 2 ;;
*) echo "Unknown arg: $1"; exit 1 ;;
esac
done

# ----------------- helpers -----------------
now_ts() { date -u +"%Y%m%dT%H%M%SZ"; }
slugify() {
# naive slug: lowercase, remove protocol, replace non-alnum with _
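# e.g. "https://Example.com/Pricing/Plans" -> "example.com_pricing_plans"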
echo "$1" | sed -E 's#^https?://##; s#[/?&=]#_#g; s#[^A-Za-z0-9._-]#_#g' | tr '[:upper:]' '[:lower:]' | cut -c1-120
}
which_chrome() {
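# print the name of the first Chrome/Chromium binary found on PATH; returns non-zero if none is found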
for b in google-chrome-stable google-chrome chromium chromium-browser chrome; do
if command -v "$b" >/dev/null 2>&1; then
echo "$b"
return 0
fi
done
return 1
}
human_size() {
# simple bytes -> human
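# usage: echo 1536000 | human_size   # prints "1.5M"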
awk 'function human(x){
s="BKMGTPE"; n=0; while(x>1024 && n<6){x/=1024; n++}
return sprintf("%.1f%s",x,substr(s,n+1,1))
} {print human($1)}'
}

# ----------------- prepare paths -----------------
TS=$(now_ts)
CANONICAL=$(echo "$URL" | sed -E 's/[?].*$//')
SLUG=$(slugify "$CANONICAL")_"$TS"
ARCHDIR="$OUTBASE/${SLUG}"
mkdir -p "$ARCHDIR"

META="$ARCHDIR/metadata.txt"
INDEX="$OUTBASE/index.tsv"
PAGE="$ARCHDIR/page.html"
PAGE_SHA="$ARCHDIR/page.sha256"
HEADERS="$ARCHDIR/headers.txt"
FAVICON="$ARCHDIR/favicon.ico"
PDFPATH="$ARCHDIR/page.pdf"
MIRROR_DIR="$ARCHDIR/mirror"

# ----------------- metadata -----------------
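# the Tools-available line records which of curl / wget / chrome / wkhtmltopdf are on PATH, printing "-" for any that are missing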
cat > "$META" <<EOF
Saved: $(now_ts)
Source URL: $URL
Canonical URL: $CANONICAL
Saved-by: $(whoami)@$(hostname)
Notes: ${NOTES:-}
Tools-available: $(command -v curl >/dev/null 2>&1 && echo curl || echo -) $(command -v wget >/dev/null 2>&1 && echo wget || echo -) $(which_chrome >/dev/null 2>&1 && echo chrome || echo -) $(command -v wkhtmltopdf >/dev/null 2>&1 && echo wkhtmltopdf || echo -)
EOF

# ----------------- fetch headers + HTML -----------------
echo "Fetching headers and HTML..."
if command -v curl >/dev/null 2>&1; then
# -D writes the response headers (including any redirects) to $HEADERS; -o saves the body to $PAGE
curl -L -sS -D "$HEADERS" -o "$PAGE" "$URL" || echo "curl failed to fetch main page"
else
echo "curl not found; skipping HTML fetch"
fi

# compute SHA256 if page saved
if [ -s "$PAGE" ]; then
sha256sum "$PAGE" | awk '{print $1}' > "$PAGE_SHA"
echo "Saved HTML -> $PAGE (SHA256: $(cat $PAGE_SHA))"
else
echo "No HTML saved (page may be blocked or curl missing)."
fi

# ----------------- try to fetch favicon -----------------
# attempt common locations intelligently
echo "Attempting favicon fetch..."
FAV_GUESS=""
# try to parse <link rel="icon"...> from page quickly
if [ -s "$PAGE" ]; then
FAV_GUESS=$(grep -iEo '<link[^>]+rel=["'\'']?(shortcut icon|icon)[^>]*>' "$PAGE" | sed -nE "s/.*href=['\"]?([^'\" ]+).*/\1/p" | head -n1 || true)
fi
if [ -z "$FAV_GUESS" ]; then
# fallback to /favicon.ico
FAV_GUESS="${CANONICAL%/}/favicon.ico"
fi

if command -v curl >/dev/null 2>&1 && [ -n "$FAV_GUESS" ]; then
# if guess is relative, build absolute
if [[ "$FAV_GUESS" != http* ]]; then
base=$(echo "$CANONICAL" | sed -E 's#(https?://[^/]+).*#\1#')
FAV_GUESS="$base/$FAV_GUESS"
fi
curl -L -sS "$FAV_GUESS" -o "$FAVICON" || true
if [ -s "$FAVICON" ]; then
echo "Fetched favicon -> $FAVICON"
else
rm -f "$FAVICON" 2>/dev/null || true
echo "No favicon found at guessed location."
fi
fi

# ----------------- optional: full mirror with wget -----------------
if [ "$MIRROR" = true ] && command -v wget >/dev/null 2>&1; then
if [ "$EXCLUDE_BIG" = "true" ]; then
echo "Mirror requested but EXCLUDE_BIG=true; skipping large-asset heavy mirroring."
else
echo "Running wget mirror (may be large) into $MIRROR_DIR ..."
mkdir -p "$MIRROR_DIR"
wget --page-requisites --adjust-extension --convert-links --no-clobber -e robots=off --wait=1 -P "$MIRROR_DIR" --restrict-file-names=windows "$URL" 2>&1 | sed -n '1,200p' || true
# create a tar of mirror for compact archival
tar -C "$MIRROR_DIR" -czf "$ARCHDIR/mirror.tgz" . || true
echo "Mirror saved (and tarred) to $ARCHDIR/mirror.tgz"
fi
elif [ "$MIRROR" = true ]; then
echo "Mirror requested but wget not installed; skipping mirror."
fi

# ----------------- optional: PDF snapshot -----------------
if [ "$PDF" = true ]; then
echo "Attempting PDF snapshot..."
CHROME_BIN=$(which_chrome || true)
if [ -n "$CHROME_BIN" ] && command -v "$CHROME_BIN" >/dev/null 2>&1; then
echo "Using headless chrome ($CHROME_BIN) to print to PDF -> $PDFPATH"
# Chrome/Chromium headless print: some versions accept --headless --print-to-pdf=...
"$CHROME_BIN" --headless --disable-gpu --no-sandbox --print-to-pdf="$PDFPATH" "$URL" >/dev/null 2>&1 || {
echo "chrome print-to-pdf failed; trying wkhtmltopdf if available..."
}
fi
if [ ! -s "$PDFPATH" ] && command -v wkhtmltopdf >/dev/null 2>&1; then
wkhtmltopdf --quiet "$URL" "$PDFPATH" || true
fi
if [ -s "$PDFPATH" ]; then
echo "PDF saved: $PDFPATH ($(stat -c%s "$PDFPATH" | human_size))"
else
echo "PDF snapshot not produced (no suitable tool or failure)."
fi
fi

# ----------------- supplementary: index line, package, checksum -----------------
# compute an archive checksum (tar.gz of archived dir contents for integrity)
# exclude the package itself so tar does not try to archive the file it is writing
tar -C "$ARCHDIR" --exclude=package.tgz -czf "$ARCHDIR/package.tgz" . || true
if [ -s "$ARCHDIR/package.tgz" ]; then
sha256sum "$ARCHDIR/package.tgz" | awk '{print $1}' > "$ARCHDIR/package.tgz.sha256" || true
fi
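# (to verify later: re-run sha256sum on package.tgz and compare against package.tgz.sha256)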

# Append summary line to index.tsv (create file if missing)
mkdir -p "$OUTBASE"
IDX_LINE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"$'\t'"$SLUG"$'\t'"$URL"$'\t'"$ARCHDIR"$'\t'"$( [ -s "$PAGE_SHA" ] && cat "$PAGE_SHA" || echo - )"$'\t'"$( [ -s "$ARCHDIR/package.tgz.sha256" ] && cat "$ARCHDIR/package.tgz.sha256" || echo - )"
printf '%s\n' "$IDX_LINE" >> "$INDEX"

# ----------------- final report -----------------
cat <<EOF

Saved archive: $ARCHDIR
metadata: $META
html: $PAGE $( [ -s "$PAGE" ] && echo "(sha256: $(cat "$PAGE_SHA"))" || echo "(not saved)" )
headers: $HEADERS
favicon: $( [ -s "$FAVICON" ] && echo "$FAVICON" || echo "not downloaded" )
package: $( [ -s "$ARCHDIR/package.tgz" ] && echo "$ARCHDIR/package.tgz (sha256: $(cat "$ARCHDIR/package.tgz.sha256"))" || echo "none" )
mirror: $( [ -d "$MIRROR_DIR" ] && echo "$MIRROR_DIR (mirror.tgz saved)" || echo "none" )
pdf: $( [ -s "$PDFPATH" ] && echo "$PDFPATH" || echo "none" )
Index updated: $INDEX

Hints:
- If you plan to do many archives, rsync the entire $OUTBASE to a USB/thumb drive or cloud backup.
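- Example backup command (destination path is just an illustration): rsync -av "$OUTBASE/" /path/to/backup/psog_archives/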
- To avoid filling your Chromebook storage, use --mirror sparingly and set EXCLUDE_BIG=true in env.

EOF