diff --git a/internal/control/refs.go b/internal/control/refs.go index 99c9474cc..25790b8de 100644 --- a/internal/control/refs.go +++ b/internal/control/refs.go @@ -7,9 +7,13 @@ import ( "io" "net/http" "os" + "os/exec" "path/filepath" "regexp" "strings" + "time" + + "reasonix/internal/proc" ) // maxFileRefBytes caps how much of an @-referenced file is injected into a @@ -17,6 +21,17 @@ import ( // and the rest noted as truncated. const maxFileRefBytes = 64 * 1024 +const pdfExtractTimeout = 8 * time.Second +const pdfExtractWaitDelay = 1 * time.Second + +var extractPDFText = extractPDFTextDefault + +type pdfExtractResult struct { + text string + tool string + truncated bool +} + // refKind distinguishes the two things an @reference can resolve to. type refKind int @@ -268,6 +283,10 @@ func readFileRef(path string) (content string, isDir bool, err error) { return b.String(), true, nil } + if strings.EqualFold(filepath.Ext(path), ".pdf") { + return readPDFRef(path, info.Size()), false, nil + } + f, err := os.Open(path) if err != nil { return "", false, err @@ -293,6 +312,145 @@ func readFileRef(path string) (content string, isDir bool, err error) { return string(data), false, nil } +func readPDFRef(path string, size int64) string { + result, err := extractPDFText(path) + if err != nil { + return fmt.Sprintf("[PDF file %s, %d bytes — text extraction unavailable: %v. If this is a scanned/image-only PDF, use OCR or an available multimodal/vision tool with this path.]", path, size, err) + } + text := strings.TrimSpace(result.text) + if text == "" { + return fmt.Sprintf("[PDF file %s, %d bytes — no extractable text found. It may be scanned/image-only; use OCR or an available multimodal/vision tool with this path.]", path, size) + } + var b strings.Builder + fmt.Fprintf(&b, "[PDF text extracted from %s using %s", path, result.tool) + if result.truncated { + fmt.Fprintf(&b, "; truncated to the first %d bytes", maxFileRefBytes) + } + b.WriteString("]\n") + b.WriteString(text) + return b.String() +} + +func extractPDFTextDefault(path string) (pdfExtractResult, error) { + var firstErr error + if pdftotext, err := exec.LookPath("pdftotext"); err == nil { + if text, truncated, err := runPDFTextCommand(pdftotext, []string{"-enc", "UTF-8", "-layout", path, "-"}); err == nil { + return pdfExtractResult{text: text, tool: "pdftotext", truncated: truncated}, nil + } else { + firstErr = err + } + } + python, err := findPython() + if err != nil { + if firstErr != nil { + return pdfExtractResult{}, fmt.Errorf("pdftotext failed (%v), and Python PDF libraries are not available", firstErr) + } + return pdfExtractResult{}, fmt.Errorf("pdftotext and Python PDF libraries are not available") + } + text, truncated, err := runPDFTextCommand(python, []string{"-c", pythonPDFExtractScript, path}) + if err != nil { + if firstErr != nil { + return pdfExtractResult{}, fmt.Errorf("pdftotext failed (%v), Python PDF extraction failed (%w)", firstErr, err) + } + return pdfExtractResult{}, err + } + return pdfExtractResult{text: text, tool: "Python PDF library", truncated: truncated}, nil +} + +func findPython() (string, error) { + for _, name := range []string{"python3", "python", "py"} { + if p, err := exec.LookPath(name); err == nil { + return p, nil + } + } + return "", fmt.Errorf("python not found") +} + +func runPDFTextCommand(name string, args []string) (string, bool, error) { + ctx, cancel := context.WithTimeout(context.Background(), pdfExtractTimeout) + defer cancel() + cmd := exec.CommandContext(ctx, name, args...) + setShellKillTree(cmd) + cmd.WaitDelay = pdfExtractWaitDelay + proc.HideWindow(cmd) + var stdout limitedBuffer + var stderr limitedBuffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + waitErr := cmd.Run() + if ctx.Err() == context.DeadlineExceeded { + return "", false, fmt.Errorf("PDF text extraction timed out") + } + if waitErr != nil { + msg := strings.TrimSpace(stderr.String()) + if msg != "" { + if stderr.Truncated() { + msg += "\n…[truncated]…" + } + return "", false, fmt.Errorf("%w: %s", waitErr, msg) + } + return "", false, waitErr + } + return stdout.String(), stdout.Truncated(), nil +} + +type limitedBuffer struct { + buf bytes.Buffer + truncated bool +} + +func (b *limitedBuffer) Write(p []byte) (int, error) { + remaining := maxFileRefBytes - b.buf.Len() + if remaining > 0 { + if len(p) > remaining { + _, _ = b.buf.Write(p[:remaining]) + b.truncated = true + } else { + _, _ = b.buf.Write(p) + } + } else if len(p) > 0 { + b.truncated = true + } + return len(p), nil +} + +func (b *limitedBuffer) String() string { return b.buf.String() } + +func (b *limitedBuffer) Truncated() bool { return b.truncated } + +const pythonPDFExtractScript = ` +import sys + +path = sys.argv[1] + +try: + from pypdf import PdfReader +except Exception: + try: + from PyPDF2 import PdfReader + except Exception: + PdfReader = None + +if PdfReader is not None: + reader = PdfReader(path) + for page in reader.pages: + text = page.extract_text() or "" + if text: + print(text) + sys.exit(0) + +try: + import pdfplumber +except Exception as exc: + raise SystemExit("no supported Python PDF library found") from exc + +with pdfplumber.open(path) as pdf: + for page in pdf.pages: + text = page.extract_text() or "" + if text: + print(text) +` + func imageMime(data []byte, path string) string { mime := http.DetectContentType(data[:min(len(data), 512)]) if strings.HasPrefix(mime, "image/") { diff --git a/internal/control/refs_test.go b/internal/control/refs_test.go index a9f32a811..dc49450be 100644 --- a/internal/control/refs_test.go +++ b/internal/control/refs_test.go @@ -202,6 +202,77 @@ func TestReadFileRef(t *testing.T) { } } +func TestReadFileRefPDFExtraction(t *testing.T) { + dir := t.TempDir() + pdfPath := filepath.Join(dir, "report.pdf") + if err := os.WriteFile(pdfPath, []byte("%PDF-1.4 fake"), 0o644); err != nil { + t.Fatal(err) + } + + oldExtract := extractPDFText + t.Cleanup(func() { extractPDFText = oldExtract }) + + extractPDFText = func(path string) (pdfExtractResult, error) { + if path != pdfPath { + t.Fatalf("extract path = %q, want %q", path, pdfPath) + } + return pdfExtractResult{text: "Quarterly results\nRevenue up", tool: "test-extractor"}, nil + } + got, isDir, err := readFileRef(pdfPath) + if err != nil || isDir { + t.Fatalf("pdf text = (isDir=%v, err=%v)", isDir, err) + } + if !strings.Contains(got, "PDF text extracted") || !strings.Contains(got, "Revenue up") { + t.Fatalf("pdf text extraction missing from output: %s", got) + } + + extractPDFText = func(string) (pdfExtractResult, error) { + return pdfExtractResult{text: " ", tool: "test-extractor"}, nil + } + got, _, err = readFileRef(pdfPath) + if err != nil { + t.Fatalf("empty pdf text err = %v", err) + } + if !strings.Contains(got, "no extractable text") || !strings.Contains(got, "OCR") { + t.Fatalf("empty pdf should ask for OCR, got: %s", got) + } + + extractPDFText = func(string) (pdfExtractResult, error) { + return pdfExtractResult{}, os.ErrNotExist + } + got, _, err = readFileRef(pdfPath) + if err != nil { + t.Fatalf("failed pdf text err = %v", err) + } + if !strings.Contains(got, "text extraction unavailable") || !strings.Contains(got, "multimodal/vision") { + t.Fatalf("failed pdf should mention OCR/vision fallback, got: %s", got) + } +} + +func TestRunPDFTextCommandCapsStderr(t *testing.T) { + t.Setenv("GO_WANT_PDF_STDERR_HELPER", "1") + + _, _, err := runPDFTextCommand(os.Args[0], []string{"-test.run=TestPDFStderrHelperProcess", "--"}) + if err == nil { + t.Fatal("expected helper command to fail") + } + msg := err.Error() + if !strings.Contains(msg, "truncated") { + t.Fatalf("expected stderr truncation marker, got: %q", msg) + } + if len(msg) > maxFileRefBytes+1024 { + t.Fatalf("stderr error grew too large: len=%d", len(msg)) + } +} + +func TestPDFStderrHelperProcess(t *testing.T) { + if os.Getenv("GO_WANT_PDF_STDERR_HELPER") != "1" { + return + } + _, _ = os.Stderr.WriteString(strings.Repeat("x", maxFileRefBytes+4096)) + os.Exit(7) +} + func TestResolveBareNamesDuplicates(t *testing.T) { temp := t.TempDir()