esengine · esengine · Jun 10, 2026 · Jun 9, 2026 · Jun 9, 2026
@@ -7,16 +7,31 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"os/exec"
 	"path/filepath"
 	"regexp"
 	"strings"
+	"time"
+
+	"reasonix/internal/proc"
 )
 
 // maxFileRefBytes caps how much of an @-referenced file is injected into a
 // message, so "@somehuge.log" can't blow the context window. The head is kept
 // and the rest noted as truncated.
 const maxFileRefBytes = 64 * 1024
 
+// Time bounds applied to every external PDF extractor command.
+const (
+	// pdfExtractTimeout is the deadline of the context each extractor
+	// command runs under.
+	pdfExtractTimeout = 8 * time.Second
+	// pdfExtractWaitDelay is the value assigned to exec.Cmd.WaitDelay for
+	// each extractor command.
+	pdfExtractWaitDelay = time.Second
+)
+
+// extractPDFText is the extractor readPDFRef calls. It is a package variable
+// rather than a direct call so tests can swap in a stub; it defaults to
+// extractPDFTextDefault.
+var extractPDFText = extractPDFTextDefault
+
+// pdfExtractResult is what a PDF text extractor returns on success.
+type pdfExtractResult struct {
+	text      string // extracted text; readPDFRef trims surrounding whitespace
+	tool      string // name of the extractor, shown in the header readPDFRef emits
+	truncated bool   // true when the extractor's output was cut at the capture limit
+}
+
 // refKind distinguishes the two things an @reference can resolve to.
 type refKind int
 
@@ -268,6 +283,10 @@ func readFileRef(path string) (content string, isDir bool, err error) {
 		return b.String(), true, nil
 	}
 
+	if strings.EqualFold(filepath.Ext(path), ".pdf") {
+		return readPDFRef(path, info.Size()), false, nil
+	}
+
 	f, err := os.Open(path)
 	if err != nil {
 		return "", false, err
@@ -293,6 +312,145 @@ func readFileRef(path string) (content string, isDir bool, err error) {
 	return string(data), false, nil
 }
 
+// readPDFRef produces the text injected for an @-referenced PDF. It never
+// fails: when extraction errors out, or yields only whitespace, the result is
+// a bracketed note (including the path and size in bytes) that points the
+// reader at OCR / vision tooling instead. On success the extracted text is
+// preceded by a one-line bracketed header naming the extractor and, when the
+// output was cut, the byte limit.
+func readPDFRef(path string, size int64) string {
+	extracted, err := extractPDFText(path)
+	if err != nil {
+		return fmt.Sprintf("[PDF file %s, %d bytes — text extraction unavailable: %v. If this is a scanned/image-only PDF, use OCR or an available multimodal/vision tool with this path.]", path, size, err)
+	}
+
+	body := strings.TrimSpace(extracted.text)
+	if body == "" {
+		return fmt.Sprintf("[PDF file %s, %d bytes — no extractable text found. It may be scanned/image-only; use OCR or an available multimodal/vision tool with this path.]", path, size)
+	}
+
+	header := fmt.Sprintf("[PDF text extracted from %s using %s", path, extracted.tool)
+	if extracted.truncated {
+		header += fmt.Sprintf("; truncated to the first %d bytes", maxFileRefBytes)
+	}
+	return header + "]\n" + body
+}
+
+// extractPDFTextDefault is the production PDF extractor. It first runs
+// pdftotext when that binary is on PATH; if it is missing or the run fails, it
+// falls back to running pythonPDFExtractScript with the interpreter located by
+// findPython. When both routes fail, the returned error mentions the pdftotext
+// failure (if there was one) alongside the Python one.
+func extractPDFTextDefault(path string) (pdfExtractResult, error) {
+	var pdftotextErr error
+	if bin, lookErr := exec.LookPath("pdftotext"); lookErr == nil {
+		out, cut, runErr := runPDFTextCommand(bin, []string{"-enc", "UTF-8", "-layout", path, "-"})
+		if runErr == nil {
+			return pdfExtractResult{text: out, tool: "pdftotext", truncated: cut}, nil
+		}
+		// Remember why pdftotext failed so the final error can report it.
+		pdftotextErr = runErr
+	}
+
+	interpreter, findErr := findPython()
+	switch {
+	case findErr != nil && pdftotextErr != nil:
+		return pdfExtractResult{}, fmt.Errorf("pdftotext failed (%v), and Python PDF libraries are not available", pdftotextErr)
+	case findErr != nil:
+		return pdfExtractResult{}, fmt.Errorf("pdftotext and Python PDF libraries are not available")
+	}
+
+	out, cut, runErr := runPDFTextCommand(interpreter, []string{"-c", pythonPDFExtractScript, path})
+	switch {
+	case runErr == nil:
+		return pdfExtractResult{text: out, tool: "Python PDF library", truncated: cut}, nil
+	case pdftotextErr != nil:
+		return pdfExtractResult{}, fmt.Errorf("pdftotext failed (%v), Python PDF extraction failed (%w)", pdftotextErr, runErr)
+	default:
+		return pdfExtractResult{}, runErr
+	}
+}
+
+// findPython returns the resolved path of the first Python launcher found on
+// PATH, trying "python3", then "python", then "py". It returns an error when
+// none of them can be located.
+func findPython() (string, error) {
+	candidates := [...]string{"python3", "python", "py"}
+	for i := range candidates {
+		resolved, err := exec.LookPath(candidates[i])
+		if err != nil {
+			continue
+		}
+		return resolved, nil
+	}
+	return "", fmt.Errorf("python not found")
+}
+
+// runPDFTextCommand runs one extractor command and returns what it wrote to
+// stdout. stdout and stderr are each captured through a limitedBuffer, so at
+// most maxFileRefBytes of either is kept; the bool result reports whether
+// stdout was cut. The command runs under a pdfExtractTimeout deadline.
+//
+// A non-nil error means the command timed out or did not run successfully; in
+// the latter case any captured stderr is appended to the error. The string and
+// bool results are zero whenever the error is non-nil.
+func runPDFTextCommand(name string, args []string) (string, bool, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), pdfExtractTimeout)
+	defer cancel()
+	cmd := exec.CommandContext(ctx, name, args...)
+	// NOTE(review): setShellKillTree and proc.HideWindow are defined outside
+	// this view; by their names they make cancellation kill the whole process
+	// tree and suppress a console window — confirm against their definitions.
+	setShellKillTree(cmd)
+	cmd.WaitDelay = pdfExtractWaitDelay
+	proc.HideWindow(cmd)
+	var stdout, stderr limitedBuffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if waitErr := cmd.Run(); waitErr != nil {
+		// Only a failed run can be a timeout. Consulting the context after a
+		// successful run would throw away good output whenever the deadline
+		// happened to pass between the process exiting and this check.
+		if ctx.Err() == context.DeadlineExceeded {
+			return "", false, fmt.Errorf("PDF text extraction timed out")
+		}
+		msg := strings.TrimSpace(stderr.String())
+		if msg == "" {
+			return "", false, waitErr
+		}
+		if stderr.Truncated() {
+			msg += "\n…[truncated]…"
+		}
+		return "", false, fmt.Errorf("%w: %s", waitErr, msg)
+	}
+
+	text := stdout.String()
+	if stdout.Truncated() && len(text) > 0 {
+		// The capture limit is applied on a raw byte boundary, so the final
+		// rune may have been cut in half. Step back over at most one encoded
+		// rune (UTF-8 uses up to 4 bytes) and drop it if it is incomplete, so
+		// the caller never receives invalid UTF-8 at the cut point.
+		start := len(text) - 1
+		for start > 0 && len(text)-start < 4 && text[start]&0xC0 == 0x80 {
+			start--
+		}
+		text = text[:start] + strings.ToValidUTF8(text[start:], "")
+	}
+	return text, stdout.Truncated(), nil
+}
+
+// limitedBuffer is an io.Writer that retains at most maxFileRefBytes of what
+// is written to it and remembers whether anything had to be dropped.
+type limitedBuffer struct {
+	buf       bytes.Buffer
+	truncated bool
+}
+
+// Write stores as much of p as still fits under the cap and discards the
+// rest. It always reports len(p) bytes written with a nil error, so a writer
+// feeding it never sees a short write once the cap is hit.
+func (b *limitedBuffer) Write(p []byte) (int, error) {
+	room := maxFileRefBytes - b.buf.Len()
+	switch {
+	case len(p) == 0:
+		// Nothing to store and nothing lost.
+	case room <= 0:
+		b.truncated = true
+	case len(p) > room:
+		_, _ = b.buf.Write(p[:room])
+		b.truncated = true
+	default:
+		_, _ = b.buf.Write(p)
+	}
+	return len(p), nil
+}
+
+// String returns the retained bytes as a string.
+func (b *limitedBuffer) String() string {
+	return b.buf.String()
+}
+
+// Truncated reports whether any written bytes were discarded.
+func (b *limitedBuffer) Truncated() bool {
+	return b.truncated
+}
+
+// pythonPDFExtractScript is the program handed to the Python interpreter via
+// "-c", with the PDF path as its only argument. It prints the text of each
+// page to stdout using the first importable library among pypdf, PyPDF2 and
+// pdfplumber, and exits with an error message when none is installed.
+const pythonPDFExtractScript = `
+import sys
+
+path = sys.argv[1]
+
+# Always emit UTF-8, matching the pdftotext route. Without this, print() uses
+# the platform's default stdout encoding, which on non-UTF-8 locales can raise
+# UnicodeEncodeError or produce bytes the caller cannot read as UTF-8.
+# reconfigure() exists on Python 3.7+; older interpreters keep their default.
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
+
+try:
+    from pypdf import PdfReader
+except Exception:
+    try:
+        from PyPDF2 import PdfReader
+    except Exception:
+        PdfReader = None
+
+if PdfReader is not None:
+    reader = PdfReader(path)
+    for page in reader.pages:
+        text = page.extract_text() or ""
+        if text:
+            print(text)
+    sys.exit(0)
+
+try:
+    import pdfplumber
+except Exception as exc:
+    raise SystemExit("no supported Python PDF library found") from exc
+
+with pdfplumber.open(path) as pdf:
+    for page in pdf.pages:
+        text = page.extract_text() or ""
+        if text:
+            print(text)
+`
+
 func imageMime(data []byte, path string) string {
 	mime := http.DetectContentType(data[:min(len(data), 512)])
 	if strings.HasPrefix(mime, "image/") {

@@ -202,6 +202,77 @@ func TestReadFileRef(t *testing.T) {
 	}
 }
 
+// TestReadFileRefPDFExtraction stubs extractPDFText and checks the three
+// shapes readFileRef can return for a .pdf path: extracted text, the
+// "no extractable text" note, and the "extraction unavailable" note.
+func TestReadFileRefPDFExtraction(t *testing.T) {
+	pdfPath := filepath.Join(t.TempDir(), "report.pdf")
+	if err := os.WriteFile(pdfPath, []byte("%PDF-1.4 fake"), 0o644); err != nil {
+		t.Fatal(err)
+	}
+
+	// Restore the real extractor once the test finishes.
+	saved := extractPDFText
+	t.Cleanup(func() { extractPDFText = saved })
+
+	// Successful extraction: the text and a header must both be present.
+	extractPDFText = func(path string) (pdfExtractResult, error) {
+		if path != pdfPath {
+			t.Fatalf("extract path = %q, want %q", path, pdfPath)
+		}
+		return pdfExtractResult{text: "Quarterly results\nRevenue up", tool: "test-extractor"}, nil
+	}
+	got, isDir, err := readFileRef(pdfPath)
+	if err != nil || isDir {
+		t.Fatalf("pdf text = (isDir=%v, err=%v)", isDir, err)
+	}
+	for _, want := range []string{"PDF text extracted", "Revenue up"} {
+		if !strings.Contains(got, want) {
+			t.Fatalf("pdf text extraction missing from output: %s", got)
+		}
+	}
+
+	// Whitespace-only output counts as "no text".
+	extractPDFText = func(string) (pdfExtractResult, error) {
+		return pdfExtractResult{text: "   ", tool: "test-extractor"}, nil
+	}
+	got, _, err = readFileRef(pdfPath)
+	if err != nil {
+		t.Fatalf("empty pdf text err = %v", err)
+	}
+	for _, want := range []string{"no extractable text", "OCR"} {
+		if !strings.Contains(got, want) {
+			t.Fatalf("empty pdf should ask for OCR, got: %s", got)
+		}
+	}
+
+	// An extractor error is reported inside the content, not as an error.
+	extractPDFText = func(string) (pdfExtractResult, error) {
+		return pdfExtractResult{}, os.ErrNotExist
+	}
+	got, _, err = readFileRef(pdfPath)
+	if err != nil {
+		t.Fatalf("failed pdf text err = %v", err)
+	}
+	for _, want := range []string{"text extraction unavailable", "multimodal/vision"} {
+		if !strings.Contains(got, want) {
+			t.Fatalf("failed pdf should mention OCR/vision fallback, got: %s", got)
+		}
+	}
+}
+
+// TestRunPDFTextCommandCapsStderr re-executes the test binary as a child that
+// floods stderr and exits non-zero (see TestPDFStderrHelperProcess), then
+// checks that the resulting error carries a truncation marker and stays
+// bounded in size.
+func TestRunPDFTextCommandCapsStderr(t *testing.T) {
+	t.Setenv("GO_WANT_PDF_STDERR_HELPER", "1")
+
+	helperArgs := []string{"-test.run=TestPDFStderrHelperProcess", "--"}
+	_, _, err := runPDFTextCommand(os.Args[0], helperArgs)
+	if err == nil {
+		t.Fatal("expected helper command to fail")
+	}
+
+	msg := err.Error()
+	const sizeLimit = maxFileRefBytes + 1024
+	switch {
+	case !strings.Contains(msg, "truncated"):
+		t.Fatalf("expected stderr truncation marker, got: %q", msg)
+	case len(msg) > sizeLimit:
+		t.Fatalf("stderr error grew too large: len=%d", len(msg))
+	}
+}
+
+// TestPDFStderrHelperProcess is not a real test: it is the child process body
+// for TestRunPDFTextCommandCapsStderr. It does nothing unless
+// GO_WANT_PDF_STDERR_HELPER is "1"; when it is, it writes more than
+// maxFileRefBytes to stderr and exits with status 7.
+func TestPDFStderrHelperProcess(t *testing.T) {
+	if os.Getenv("GO_WANT_PDF_STDERR_HELPER") == "1" {
+		flood := strings.Repeat("x", maxFileRefBytes+4096)
+		_, _ = os.Stderr.WriteString(flood)
+		os.Exit(7)
+	}
+}
+
 func TestResolveBareNamesDuplicates(t *testing.T) {
 	temp := t.TempDir()