diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 92a87c1..5e9c05b 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -26,7 +26,16 @@ Read [`LIBRARIAN.md`](../LIBRARIAN.md) before navigating the collections. It exp ## Tools ```bash +# Full reconstruction from a clean clone (configure .env first) +./bootstrap.sh + +# Restore cloud-storage symlinks only +./init-symlinks.sh + +# Download PDFs from an archive page python3 download.py "URL" --output-dir collections/NAME/pdfs + +# Convert PDFs to searchable Markdown python3 convert.py --input-dir collections/NAME/pdfs --output-dir collections/NAME/indexed ``` diff --git a/AGENTS.md b/AGENTS.md index 0e04e82..c0451d4 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -26,7 +26,16 @@ Read [`LIBRARIAN.md`](LIBRARIAN.md) before navigating the collections. It explai ## Tools ```bash +# Full reconstruction from a clean clone (configure .env first) +./bootstrap.sh + +# Restore cloud-storage symlinks only +./init-symlinks.sh + +# Download PDFs from an archive page python3 download.py "URL" --output-dir collections/NAME/pdfs + +# Convert PDFs to searchable Markdown python3 convert.py --input-dir collections/NAME/pdfs --output-dir collections/NAME/indexed ``` diff --git a/CLAUDE.md b/CLAUDE.md index 8a5679c..b788d0f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -32,7 +32,8 @@ publication-library/ ├── convert.py ← convert PDFs → markdown + page PNGs ├── search.py ← search across indexed collections with formatted output ├── init-findings.sh ← scaffold the findings/ directory -├── init-symlinks.sh ← recreate cloud-storage symlinks (configure via .env) +├── init-symlinks.sh ← recreate cloud-storage symlinks (auto-derived from collections/) +├── bootstrap.sh ← full reconstruction pipeline from a clean clone ├── .env.template ← configuration template (copy to .env and set LIBRARY_BASE) ├── README.md ├── CLAUDE.md ← this file @@ -76,9 +77,11 @@ publication-library/ # Scaffold 
findings/ directory ./init-findings.sh -# Set up cloud-storage symlinks (first time or new machine) +# Full reconstruction from a clean clone (downloads, converts, catalogues) cp .env.template .env # then edit .env and set LIBRARY_BASE -# Edit the LINKS array in init-symlinks.sh for your collection layout +./bootstrap.sh + +# Restore symlinks only (LINKS auto-derived from collections/; override via .env) ./init-symlinks.sh # Download from an archive page diff --git a/GEMINI.md b/GEMINI.md index f5dd1f1..f9059eb 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -26,7 +26,16 @@ Read [`LIBRARIAN.md`](LIBRARIAN.md) before navigating the collections. It explai ## Tools ```bash +# Full reconstruction from a clean clone (configure .env first) +./bootstrap.sh + +# Restore cloud-storage symlinks only +./init-symlinks.sh + +# Download PDFs from an archive page python3 download.py "URL" --output-dir collections/NAME/pdfs + +# Convert PDFs to searchable Markdown python3 convert.py --input-dir collections/NAME/pdfs --output-dir collections/NAME/indexed ``` diff --git a/README.md b/README.md index 67908d8..8d41745 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,8 @@ Python 3.10+. No other dependencies. 
| `convert.py` | Convert a folder of PDFs to searchable Markdown with page images | | `search.py` | Search across all indexed collections with grouped, formatted output | | `init-findings.sh` | Scaffold the `findings/` directory, with optional cloud storage symlink | -| `init-symlinks.sh` | Recreate cloud-storage symlinks for PDFs, indexed output, and findings | +| `init-symlinks.sh` | Recreate cloud-storage symlinks (auto-derived from `collections/`) | +| `bootstrap.sh` | Full reconstruction pipeline: symlinks → download → convert → catalogue | --- @@ -136,7 +137,7 @@ making everything available across multiple machines without committing copyrigh Both `collections/*/pdfs`, `collections/*/indexed`, and `findings/` are gitignored, so symlinks to cloud folders work seamlessly with version control. -### Recommended folder layout +### Recommended cloud folder layout A clean convention is to store each collection's PDFs in a named folder, and use `library-` prefixed folders for the derived library assets (indexed output and findings). For example, using Dropbox: @@ -153,37 +154,43 @@ folders for the derived library assets (indexed output and findings). For exampl The `library-` prefix distinguishes library infrastructure from per-collection PDF archives at a glance. -### Set up symlinks with init-symlinks.sh +### One-command reconstruction with bootstrap.sh -`init-symlinks.sh` automates symlink creation. Configure it once, then run it after every clone or -on each new machine. - -**1. Set your library base path:** +After cloning on a new machine, `bootstrap.sh` rebuilds the entire library in one step: ```bash cp .env.template .env -# Edit .env and set LIBRARY_BASE to your cloud storage root, e.g.: +# Edit .env — set LIBRARY_BASE to your cloud storage root, e.g.: # LIBRARY_BASE="${HOME}/Dropbox/my-library" + +./bootstrap.sh ``` -**2. 
Edit the `LINKS` array in `init-symlinks.sh`** to list your collections: +This creates cloud directories, restores symlinks, downloads PDFs for collections that have none +(using the Source URL from each `COLLECTION.md`), converts them to searchable Markdown, and +regenerates `CATALOGUE.md`. It is idempotent — collections that already hold PDFs or indexed output are skipped. + +### Symlinks only + +`init-symlinks.sh` restores symlinks without downloading or converting. Symlink targets are +auto-derived from `collections/` using the naming convention above. To override, define a `LINKS` +array in `.env`: ```bash -declare -a LINKS=( +# .env +LINKS=( "findings:${LIBRARY_BASE}/library-findings" "collections/collection-a/pdfs:${LIBRARY_BASE}/collection-a" "collections/collection-a/indexed:${LIBRARY_BASE}/library-indexed/collection-a" ) ``` -**3. Run it:** +Then run: ```bash ./init-symlinks.sh ``` -The script is idempotent — safe to re-run; existing symlinks are skipped. - --- ## Library catalogue diff --git a/bootstrap.sh b/bootstrap.sh new file mode 100755 index 0000000..d17cd95 --- /dev/null +++ b/bootstrap.sh @@ -0,0 +1,194 @@ +#!/usr/bin/env bash +# bootstrap.sh — Full reconstruction pipeline for a publication-library instance. +# +# Reconstructs the complete library from a clean clone: +# 1. Validates configuration (.env, LIBRARY_BASE) +# 2. Initialises the lib/pfb submodule if needed +# 3. Creates cloud-storage directories for each collection +# 4. Runs init-symlinks.sh to restore local symlinks +# 5. Downloads PDFs for each collection (skips if already present) +# 6. Converts PDFs to searchable Markdown (skips if already done) +# 7. 
Regenerates the cross-collection CATALOGUE.md
+#
+# Name:         bootstrap.sh
+# Description:  Full reconstruction pipeline for a publication-library instance
+# Author:       Alister Lewis-Bowen
+# Usage:        ./bootstrap.sh
+# Dependencies: bash 4+, python3, pymupdf, git, lib/pfb submodule
+# Exit codes:   0 success, 1 error
+#
+# Configuration (via .env or environment):
+#   LIBRARY_BASE  Absolute path to the root of your cloud-synced library storage.
+#                 See .env.template for examples.
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Bootstrap
+# ---------------------------------------------------------------------------
+
+# Run from the script's own directory so the relative collections/ paths
+# passed to download.py and convert.py below resolve correctly.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${SCRIPT_DIR}"
+
+# Source .env if present
+if [[ -f "${SCRIPT_DIR}/.env" ]]; then
+  # shellcheck source=/dev/null
+  source "${SCRIPT_DIR}/.env"
+fi
+
+# Initialise lib/pfb submodule if not already done
+if [[ ! -f "${SCRIPT_DIR}/lib/pfb/pfb.sh" ]]; then
+  echo "Initialising lib/pfb submodule..."
+  git submodule update --init lib/pfb
+fi
+
+# Load pfb for terminal output
+# shellcheck source=lib/pfb/pfb.sh
+source "${SCRIPT_DIR}/lib/pfb/pfb.sh"
+
+# ---------------------------------------------------------------------------
+# Validate required configuration
+# ---------------------------------------------------------------------------
+
+pfb heading "publication-library bootstrap" "📚"
+echo
+
+if [[ ! -f "${SCRIPT_DIR}/.env" ]]; then
+  pfb error ".env not found."
+  pfb subheading "Copy .env.template to .env and set LIBRARY_BASE:"
+  pfb subheading " cp .env.template .env"
+  exit 1
+fi
+
+if [[ -z "${LIBRARY_BASE:-}" ]]; then
+  pfb error "LIBRARY_BASE is not set in .env."
+  pfb subheading "Edit .env and set LIBRARY_BASE to your cloud storage root."
+  exit 1
+fi
+
+pfb info "LIBRARY_BASE: ${LIBRARY_BASE}"
+echo
+
+# ---------------------------------------------------------------------------
+# Helper: parse Source URL from a COLLECTION.md
+# Looks for the first table row shaped like
+#   | **Source** | [text](URL) |
+# using bash's built-in regex matching. This avoids `grep -P`, a GNU
+# extension that is not available in every grep; with the failure hidden
+# behind `|| true`, every collection would be skipped as having no URL.
+# @param $1 Path to COLLECTION.md
+# @return Prints the first source URL found, or an empty line if the file is
+#         missing or contains no matching row
+# ---------------------------------------------------------------------------
+parse_source_url() {
+  local col_md="${1}"
+  local line
+  # POSIX ERE for the row shape above; capture group 1 is the URL.
+  local source_row_re='\|[[:space:]]+\*\*Source\*\*[[:space:]]+\|[[:space:]]+\[[^]]+\]\(([^)]+)'
+  if [[ -f "${col_md}" ]]; then
+    # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
+    while IFS= read -r line || [[ -n "${line}" ]]; do
+      if [[ "${line}" =~ ${source_row_re} ]]; then
+        echo "${BASH_REMATCH[1]}"
+        return 0
+      fi
+    done < "${col_md}"
+  fi
+  echo ""
+}
+
+# ---------------------------------------------------------------------------
+# Helper: count files matching a name pattern beneath a directory
+# Symlinks are followed. A missing directory counts as zero instead of
+# failing: under `set -o pipefail` a failing find makes the whole pipeline,
+# and therefore the assignment, fail, and `set -e` would then abort the
+# script with its error message already discarded by 2>/dev/null.
+# @param $1 Directory to search
+# @param $2 find(1) -name pattern (e.g. "*.pdf")
+# @return Prints the number of matching entries
+# ---------------------------------------------------------------------------
+count_files() {
+  local dir="${1}"
+  local pattern="${2}"
+  local count=0
+  if [[ -d "${dir}" ]]; then
+    # Partial find failures (e.g. unreadable subdirectories) are tolerated
+    # on purpose; whatever was found is still counted.
+    count="$(find -L "${dir}" -name "${pattern}" 2>/dev/null | wc -l | tr -d ' ')" || true
+  fi
+  echo "${count:-0}"
+}
+
+# ---------------------------------------------------------------------------
+# Phase 1 — Create cloud-storage directories
+# ---------------------------------------------------------------------------
+
+pfb heading "Creating cloud-storage directories" "☁️"
+echo
+
+mkdir -p "${LIBRARY_BASE}/library-findings"
+pfb success "READY ${LIBRARY_BASE}/library-findings"
+
+for col_dir in "${SCRIPT_DIR}/collections"/*/; do
+  # An unmatched glob stays literal; the -d test filters that case out.
+  [[ -d "${col_dir}" ]] || continue
+  name="$(basename "${col_dir}")"
+  mkdir -p "${LIBRARY_BASE}/${name}"
+  pfb success "READY ${LIBRARY_BASE}/${name}"
+  mkdir -p "${LIBRARY_BASE}/library-indexed/${name}"
+  pfb success "READY ${LIBRARY_BASE}/library-indexed/${name}"
+done
+
+echo
+
+# ---------------------------------------------------------------------------
+# Phase 2 — Restore symlinks
+# ---------------------------------------------------------------------------
+
+pfb heading "Restoring symlinks" "🔗"
+echo
+"${SCRIPT_DIR}/init-symlinks.sh"
+echo
+
+# ---------------------------------------------------------------------------
+# Phase 3 — Download PDFs
+# ---------------------------------------------------------------------------
+
+pfb heading "Downloading PDFs" "⬇️"
+echo
+
+for col_dir in "${SCRIPT_DIR}/collections"/*/; do
+  [[ -d "${col_dir}" ]] || continue
+  name="$(basename "${col_dir}")"
+  col_md="${col_dir}COLLECTION.md"
+
+  source_url="$(parse_source_url "${col_md}")"
+
+  if [[ -z "${source_url}" ]]; then
+    pfb warn "SKIP ${name} — no Source URL in COLLECTION.md"
+    continue
+  fi
+
+  # Count PDFs already present (follow symlinks). Any existing PDF skips the
+  # whole collection; partially downloaded collections are not topped up.
+  pdf_count="$(count_files "${col_dir}pdfs" "*.pdf")"
+  if [[ "${pdf_count}" -gt 0 ]]; then
+    pfb info "SKIP ${name} — ${pdf_count} PDFs already present"
+    continue
+  fi
+
+  pfb subheading "${name} — downloading from ${source_url}"
+  python3 "${SCRIPT_DIR}/download.py" "${source_url}" \
+    --output-dir "collections/${name}/pdfs"
+  pfb success "DONE ${name}"
+done
+
+echo
+
+# ---------------------------------------------------------------------------
+# Phase 4 — Convert PDFs to Markdown
+# ---------------------------------------------------------------------------
+
+pfb heading "Converting PDFs to Markdown" "📄"
+echo
+
+for col_dir in "${SCRIPT_DIR}/collections"/*/; do
+  [[ -d "${col_dir}" ]] || continue
+  name="$(basename "${col_dir}")"
+
+  # Skip if indexed output already exists and is non-empty
+  indexed_count="$(count_files "${col_dir}indexed" "index.md")"
+  if [[ "${indexed_count}" -gt 0 ]]; then
+    pfb info "SKIP ${name} — already converted (${indexed_count} index files)"
+    continue
+  fi
+
+  # Skip if there are no PDFs to convert
+  pdf_count="$(count_files "${col_dir}pdfs" "*.pdf")"
+  if [[ "${pdf_count}" -eq 0 ]]; then
+    pfb warn "SKIP ${name} — no PDFs found in collections/${name}/pdfs"
+    continue
+  fi
+
+  pfb subheading "${name} — converting ${pdf_count} PDFs"
+  python3 "${SCRIPT_DIR}/convert.py" \
+    --input-dir "collections/${name}/pdfs" \
+    --output-dir "collections/${name}/indexed" \
+    --pattern "**/*.pdf" \
+    --write-collection-md
+  pfb success "DONE ${name}"
+done
+
+echo
+
+# ---------------------------------------------------------------------------
+# Phase 5 — Regenerate catalogue
+# ---------------------------------------------------------------------------
+
+pfb heading "Regenerating CATALOGUE.md" "🗂️"
+echo
+python3 "${SCRIPT_DIR}/convert.py" --global-index collections/
+echo
+pfb success "Done. Library reconstruction complete."
diff --git a/init-symlinks.sh b/init-symlinks.sh
index 57bb618..aa22556 100755
--- a/init-symlinks.sh
+++ b/init-symlinks.sh
@@ -16,11 +16,17 @@
 # LIBRARY_BASE Absolute path to the root of your cloud-synced library storage.
 # See .env.template for examples.
 #
-# LINKS format: "local_path:cloud_target"
-# Edit the LINKS array below to match your instance's collection layout.
-# - findings → flat directory in cloud storage
-# - COLLECTION/pdfs → per-collection PDF directory in cloud storage
-# - COLLECTION/indexed → per-collection indexed output under library-indexed/
+# LINKS are auto-derived from collections/*/ using the naming convention:
+# collections/NAME/pdfs → ${LIBRARY_BASE}/NAME
+# collections/NAME/indexed → ${LIBRARY_BASE}/library-indexed/NAME
+# findings → ${LIBRARY_BASE}/library-findings
+#
+# To override, define a LINKS array in .env before running:
+# LINKS=(
+# "findings:${LIBRARY_BASE}/library-findings"
+# "collections/NAME/pdfs:${LIBRARY_BASE}/NAME"
+# "collections/NAME/indexed:${LIBRARY_BASE}/library-indexed/NAME"
+# )
 
 set -euo pipefail
 
@@ -31,7 +37,7 @@ set -euo pipefail
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "${SCRIPT_DIR}"
 
-# Source .env if present (allows LIBRARY_BASE to be set there)
+# Source .env if present (allows LIBRARY_BASE and optional LINKS override to be set there)
 if [[ -f "${SCRIPT_DIR}/.env" ]]; then
   # shellcheck source=/dev/null
   source "${SCRIPT_DIR}/.env"
@@ -62,22 +68,25 @@ if [[ -z "${LIBRARY_BASE:-}" ]]; then
 fi
 
 # ---------------------------------------------------------------------------
-# Symlink definitions — edit to match your instance layout
+# Build LINKS — auto-derive from collections/*/ unless overridden in .env
 # 
---------------------------------------------------------------------------
-# Format: "local_path:cloud_target"
-# local_path — path relative to this script (e.g. findings, collections/NAME/pdfs)
-# cloud_target — absolute path under LIBRARY_BASE
-#
-# Example entries (uncomment and adapt):
-#
-# "findings:${LIBRARY_BASE}/library-findings"
-# "collections/my-collection/pdfs:${LIBRARY_BASE}/my-collection"
-# "collections/my-collection/indexed:${LIBRARY_BASE}/library-indexed/my-collection"
 
-declare -a LINKS=(
-  # Add your symlink entries here. Example:
-  # "findings:${LIBRARY_BASE}/library-findings"
-)
+# @description Build the LINKS array from collections/*/ directories.
+#   Each entry has the form "local_path:cloud_target". The findings entry is
+#   always added; every directory under collections/ contributes a pdfs and
+#   an indexed entry.
+# @globals Reads SCRIPT_DIR and LIBRARY_BASE.
+# @side_effects Populates the global LINKS array (replacing any previous
+#   contents). Loop variables are local, so no other globals are touched.
+build_links() {
+  local col_dir name
+  LINKS=()
+  LINKS+=("findings:${LIBRARY_BASE}/library-findings")
+  for col_dir in "${SCRIPT_DIR}/collections"/*/; do
+    # An unmatched glob stays literal; the -d test filters that case out.
+    [[ -d "${col_dir}" ]] || continue
+    name="$(basename "${col_dir}")"
+    LINKS+=("collections/${name}/pdfs:${LIBRARY_BASE}/${name}")
+    LINKS+=("collections/${name}/indexed:${LIBRARY_BASE}/library-indexed/${name}")
+  done
+}
+
+# Auto-derive only when .env did not supply a non-empty LINKS array.
+# The ${LINKS[*]+set} test keeps `set -u` from tripping on an unset array.
+if [[ -z "${LINKS[*]+set}" ]] || [[ ${#LINKS[@]} -eq 0 ]]; then
+  build_links
+fi
 
 # ---------------------------------------------------------------------------
 # Main
@@ -88,7 +97,7 @@ pfb subheading "LIBRARY_BASE: ${LIBRARY_BASE}"
 echo
 
 if [[ ${#LINKS[@]} -eq 0 ]]; then
-  pfb warn "No symlinks defined. Edit the LINKS array in this script."
+  pfb warn "No collections found under collections/ and no LINKS defined in .env."
   exit 0
 fi