Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 158 additions & 0 deletions internal/control/refs.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,31 @@ import (
"io"
"net/http"
"os"
"os/exec"
"path/filepath"
"regexp"
"strings"
"time"

"reasonix/internal/proc"
)

// maxFileRefBytes is the upper bound on how many bytes of an @-referenced
// file are injected into a message, so that a reference such as
// "@somehuge.log" cannot blow the context window. Only the head of the file is
// kept; the remainder is reported as truncated.
const maxFileRefBytes = 64 << 10

// Time limits applied to each external PDF extractor invocation.
const (
	// pdfExtractTimeout is the deadline of the context an extractor runs under.
	pdfExtractTimeout = 8 * time.Second
	// pdfExtractWaitDelay is the value assigned to exec.Cmd.WaitDelay for an
	// extractor process.
	pdfExtractWaitDelay = time.Second
)

// extractPDFText is the extractor readPDFRef calls. It is a package variable
// rather than a direct call so that it can be swapped out (the tests replace
// it with stubs).
var extractPDFText = extractPDFTextDefault

// pdfExtractResult is what a PDF text extractor hands back.
type pdfExtractResult struct {
	text      string // extracted text as captured from the extractor
	tool      string // label of the extractor, echoed in the message header
	truncated bool   // whether the captured output was cut at the byte cap
}

// refKind distinguishes the two things an @reference can resolve to.
type refKind int

Expand Down Expand Up @@ -268,6 +283,10 @@ func readFileRef(path string) (content string, isDir bool, err error) {
return b.String(), true, nil
}

if strings.EqualFold(filepath.Ext(path), ".pdf") {
return readPDFRef(path, info.Size()), false, nil
}

f, err := os.Open(path)
if err != nil {
return "", false, err
Expand All @@ -293,6 +312,145 @@ func readFileRef(path string) (content string, isDir bool, err error) {
return string(data), false, nil
}

// readPDFRef builds the message content for an @-referenced PDF at path.
// size is the file's size in bytes and is only used in the fallback notes.
//
// It always returns a string and never an error:
//   - when extraction fails, a bracketed note carrying the error and pointing
//     at OCR / vision tooling;
//   - when extraction yields only whitespace, a bracketed note saying no text
//     was found;
//   - otherwise a bracketed header naming the extractor (and the truncation, if
//     any) followed by the trimmed text.
func readPDFRef(path string, size int64) string {
	result, err := extractPDFText(path)
	if err != nil {
		return fmt.Sprintf("[PDF file %s, %d bytes — text extraction unavailable: %v. If this is a scanned/image-only PDF, use OCR or an available multimodal/vision tool with this path.]", path, size, err)
	}

	text := result.text
	if result.truncated {
		// Truncation is by byte count, so the cut can land in the middle of a
		// multi-byte UTF-8 sequence. Drop the broken bytes so the message does
		// not carry invalid UTF-8.
		text = strings.ToValidUTF8(text, "")
	}
	text = strings.TrimSpace(text)
	if text == "" {
		return fmt.Sprintf("[PDF file %s, %d bytes — no extractable text found. It may be scanned/image-only; use OCR or an available multimodal/vision tool with this path.]", path, size)
	}

	var b strings.Builder
	fmt.Fprintf(&b, "[PDF text extracted from %s using %s", path, result.tool)
	if result.truncated {
		fmt.Fprintf(&b, "; truncated to the first %d bytes", maxFileRefBytes)
	}
	b.WriteString("]\n")
	b.WriteString(text)
	return b.String()
}

// extractPDFTextDefault is the production extractor: it tries the pdftotext
// binary first and falls back to a Python one-liner script.
//
// pdftotext is only attempted when it is found on PATH; if it runs and fails,
// its error is remembered and folded into whatever error the Python fallback
// ends up producing. The returned result is the zero value whenever the error
// is non-nil.
func extractPDFTextDefault(path string) (pdfExtractResult, error) {
	var pdftotextErr error
	if bin, lookErr := exec.LookPath("pdftotext"); lookErr == nil {
		out, cut, runErr := runPDFTextCommand(bin, []string{"-enc", "UTF-8", "-layout", path, "-"})
		if runErr == nil {
			return pdfExtractResult{text: out, tool: "pdftotext", truncated: cut}, nil
		}
		pdftotextErr = runErr
	}

	interpreter, err := findPython()
	if err != nil {
		// No interpreter at all: report it together with the pdftotext
		// failure, if there was one.
		if pdftotextErr == nil {
			return pdfExtractResult{}, fmt.Errorf("pdftotext and Python PDF libraries are not available")
		}
		return pdfExtractResult{}, fmt.Errorf("pdftotext failed (%v), and Python PDF libraries are not available", pdftotextErr)
	}

	out, cut, err := runPDFTextCommand(interpreter, []string{"-c", pythonPDFExtractScript, path})
	switch {
	case err == nil:
		return pdfExtractResult{text: out, tool: "Python PDF library", truncated: cut}, nil
	case pdftotextErr != nil:
		return pdfExtractResult{}, fmt.Errorf("pdftotext failed (%v), Python PDF extraction failed (%w)", pdftotextErr, err)
	default:
		return pdfExtractResult{}, err
	}
}

// findPython returns the resolved path of the first Python launcher found on
// PATH, probing "python3", "python" and "py" in that order. It returns an
// error when none of them can be located.
func findPython() (string, error) {
	candidates := [...]string{"python3", "python", "py"}
	for i := 0; i < len(candidates); i++ {
		resolved, err := exec.LookPath(candidates[i])
		if err != nil {
			continue
		}
		return resolved, nil
	}
	return "", fmt.Errorf("python not found")
}

func runPDFTextCommand(name string, args []string) (string, bool, error) {
ctx, cancel := context.WithTimeout(context.Background(), pdfExtractTimeout)
defer cancel()
cmd := exec.CommandContext(ctx, name, args...)
Comment thread
SivanCola marked this conversation as resolved.
setShellKillTree(cmd)
cmd.WaitDelay = pdfExtractWaitDelay
proc.HideWindow(cmd)
var stdout limitedBuffer
var stderr limitedBuffer
cmd.Stdout = &stdout
cmd.Stderr = &stderr
waitErr := cmd.Run()
if ctx.Err() == context.DeadlineExceeded {
return "", false, fmt.Errorf("PDF text extraction timed out")
}
if waitErr != nil {
msg := strings.TrimSpace(stderr.String())
if msg != "" {
if stderr.Truncated() {
msg += "\n…[truncated]…"
}
return "", false, fmt.Errorf("%w: %s", waitErr, msg)
}
return "", false, waitErr
}
return stdout.String(), stdout.Truncated(), nil
}

// limitedBuffer is an io.Writer that keeps at most maxFileRefBytes bytes and
// silently discards the rest, remembering that it did so. The zero value is
// ready to use.
type limitedBuffer struct {
	buf       bytes.Buffer
	truncated bool
}

// Write stores as much of p as still fits under the cap and drops the
// remainder. It always reports len(p) bytes written and a nil error, so the
// producer is never interrupted by the cap.
func (b *limitedBuffer) Write(p []byte) (int, error) {
	room := maxFileRefBytes - b.buf.Len()
	switch {
	case len(p) == 0:
		// Nothing to store; an empty write never marks truncation.
	case room <= 0:
		// Already full: everything in p is discarded.
		b.truncated = true
	case len(p) > room:
		// Partial fit: keep the head of p, discard the tail.
		b.truncated = true
		_, _ = b.buf.Write(p[:room])
	default:
		_, _ = b.buf.Write(p)
	}
	return len(p), nil
}

// String returns the bytes retained so far as a string.
func (b *limitedBuffer) String() string {
	return b.buf.String()
}

// Truncated reports whether any written bytes were discarded.
func (b *limitedBuffer) Truncated() bool {
	return b.truncated
}

// pythonPDFExtractScript is the Python program passed to the interpreter with
// -c, followed by the PDF path as its only argument. It prints the text of
// every page to stdout, using the first library that imports successfully:
// pypdf, then PyPDF2, then pdfplumber. When none is installed it exits with
// the message "no supported Python PDF library found".
//
// The script forces UTF-8 on stdout so its output does not depend on the
// platform locale. Python block structure is indentation-sensitive, so the
// indentation inside the raw string is significant.
const pythonPDFExtractScript = `
import sys

path = sys.argv[1]

# Emit UTF-8 regardless of the platform locale; unencodable characters are
# replaced instead of aborting the run. reconfigure() needs Python 3.7+.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")

try:
    from pypdf import PdfReader
except Exception:
    try:
        from PyPDF2 import PdfReader
    except Exception:
        PdfReader = None

if PdfReader is not None:
    reader = PdfReader(path)
    for page in reader.pages:
        text = page.extract_text() or ""
        if text:
            print(text)
    sys.exit(0)

try:
    import pdfplumber
except Exception as exc:
    raise SystemExit("no supported Python PDF library found") from exc

with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        text = page.extract_text() or ""
        if text:
            print(text)
`

func imageMime(data []byte, path string) string {
mime := http.DetectContentType(data[:min(len(data), 512)])
if strings.HasPrefix(mime, "image/") {
Expand Down
71 changes: 71 additions & 0 deletions internal/control/refs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,77 @@ func TestReadFileRef(t *testing.T) {
}
}

// TestReadFileRefPDFExtraction drives readFileRef on a ".pdf" path with the
// package-level extractor stubbed out, covering three outcomes in turn:
// extracted text, whitespace-only text, and an extractor error. In every case
// readFileRef is expected to succeed and describe the outcome in its content.
func TestReadFileRefPDFExtraction(t *testing.T) {
	pdfPath := filepath.Join(t.TempDir(), "report.pdf")
	if writeErr := os.WriteFile(pdfPath, []byte("%PDF-1.4 fake"), 0o644); writeErr != nil {
		t.Fatal(writeErr)
	}

	// Restore the real extractor once the test is done.
	saved := extractPDFText
	t.Cleanup(func() { extractPDFText = saved })

	// containsAll reports whether s contains every one of subs.
	containsAll := func(s string, subs ...string) bool {
		for _, sub := range subs {
			if !strings.Contains(s, sub) {
				return false
			}
		}
		return true
	}

	// Extractor returns text: it must show up under the extraction header,
	// and the extractor must have been handed the referenced path.
	extractPDFText = func(path string) (pdfExtractResult, error) {
		if path != pdfPath {
			t.Fatalf("extract path = %q, want %q", path, pdfPath)
		}
		return pdfExtractResult{text: "Quarterly results\nRevenue up", tool: "test-extractor"}, nil
	}
	out, isDir, err := readFileRef(pdfPath)
	if isDir || err != nil {
		t.Fatalf("pdf text = (isDir=%v, err=%v)", isDir, err)
	}
	if !containsAll(out, "PDF text extracted", "Revenue up") {
		t.Fatalf("pdf text extraction missing from output: %s", out)
	}

	// Extractor returns only whitespace: the content must suggest OCR.
	extractPDFText = func(string) (pdfExtractResult, error) {
		return pdfExtractResult{text: " ", tool: "test-extractor"}, nil
	}
	out, _, err = readFileRef(pdfPath)
	if err != nil {
		t.Fatalf("empty pdf text err = %v", err)
	}
	if !containsAll(out, "no extractable text", "OCR") {
		t.Fatalf("empty pdf should ask for OCR, got: %s", out)
	}

	// Extractor fails: the content must mention the vision fallback.
	extractPDFText = func(string) (pdfExtractResult, error) {
		return pdfExtractResult{}, os.ErrNotExist
	}
	out, _, err = readFileRef(pdfPath)
	if err != nil {
		t.Fatalf("failed pdf text err = %v", err)
	}
	if !containsAll(out, "text extraction unavailable", "multimodal/vision") {
		t.Fatalf("failed pdf should mention OCR/vision fallback, got: %s", out)
	}
}

// TestRunPDFTextCommandCapsStderr re-executes the test binary as a helper
// process (TestPDFStderrHelperProcess) that writes more than maxFileRefBytes
// to stderr and exits non-zero, then checks that the resulting error carries
// a truncation marker and stays close to the cap in size.
func TestRunPDFTextCommandCapsStderr(t *testing.T) {
	// The helper only misbehaves when this variable is set; the child
	// process inherits it.
	t.Setenv("GO_WANT_PDF_STDERR_HELPER", "1")

	helperArgs := []string{"-test.run=TestPDFStderrHelperProcess", "--"}
	_, _, runErr := runPDFTextCommand(os.Args[0], helperArgs)
	if runErr == nil {
		t.Fatal("expected helper command to fail")
	}

	got := runErr.Error()
	if !strings.Contains(got, "truncated") {
		t.Fatalf("expected stderr truncation marker, got: %q", got)
	}
	// Allow some slack on top of the cap for the wrapped error and marker.
	if limit := maxFileRefBytes + 1024; len(got) > limit {
		t.Fatalf("stderr error grew too large: len=%d", len(got))
	}
}

// TestPDFStderrHelperProcess is not a real test. When the environment variable
// GO_WANT_PDF_STDERR_HELPER is "1" it acts as a misbehaving child process:
// it writes maxFileRefBytes+4096 bytes to stderr and exits with status 7.
// In a normal test run the variable is unset and the function does nothing.
func TestPDFStderrHelperProcess(t *testing.T) {
	if os.Getenv("GO_WANT_PDF_STDERR_HELPER") == "1" {
		payload := strings.Repeat("x", maxFileRefBytes+4096)
		_, _ = os.Stderr.WriteString(payload)
		os.Exit(7)
	}
}

func TestResolveBareNamesDuplicates(t *testing.T) {
temp := t.TempDir()

Expand Down
Loading