From d3fa4939d8a5a9e98a6a0b80e1ff8cf6c379c947 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sun, 17 May 2026 11:38:44 +0200 Subject: [PATCH 01/26] fix: preserve owner and document_type when replacing document in uploadProcessedPDF --- ocr.go | 9 +++++++++ paperless.go | 2 ++ types.go | 3 +++ 3 files changed, 14 insertions(+) diff --git a/ocr.go b/ocr.go index 313ed1f6..a25b4369 100644 --- a/ocr.go +++ b/ocr.go @@ -489,6 +489,10 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData "title": originalDoc.Title, } + if originalDoc.Owner != nil { + metadata["owner"] = *originalDoc.Owner + } + // Copy metadata from original document if requested if options.CopyMetadata { // Get tag IDs @@ -535,6 +539,11 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData if originalDoc.CreatedDate != "" { metadata["created"] = originalDoc.CreatedDate } + + // Set document type if available + if originalDoc.DocumentType != 0 { + metadata["document_type"] = originalDoc.DocumentType + } } else if app.pdfOCRTagging { // Even if not copying all metadata, still add the OCR complete tag if tagging is enabled allTags, err := app.Client.GetAllTags(ctx) diff --git a/paperless.go b/paperless.go index 170658db..afa9ca12 100644 --- a/paperless.go +++ b/paperless.go @@ -446,6 +446,8 @@ func (client *PaperlessClient) GetDocument(ctx context.Context, documentID int) OriginalFileName: documentResponse.OriginalFileName, CustomFields: documentResponse.CustomFields, DocumentTypeName: documentTypeName, + DocumentType: documentResponse.DocumentType, + Owner: documentResponse.Owner, }, nil } diff --git a/types.go b/types.go index 6ede90b5..df1a8ed0 100644 --- a/types.go +++ b/types.go @@ -71,6 +71,7 @@ type GetDocumentApiResponse struct { OriginalFileName string `json:"original_file_name"` Notes []interface{} `json:"notes"` CustomFields []CustomFieldResponse `json:"custom_fields"` + Owner *int `json:"owner"` } // Document is a stripped down version of 
the document object from paperless-ngx. @@ -84,7 +85,9 @@ type Document struct { CreatedDate string `json:"created_date"` OriginalFileName string `json:"original_file_name"` DocumentTypeName string `json:"document_type_name"` + DocumentType int `json:"document_type"` CustomFields []CustomFieldResponse `json:"custom_fields"` + Owner *int `json:"owner"` } // GenerateSuggestionsRequest is the request payload for generating suggestions for /generate-suggestions endpoint From d44450d41bcae91256bd8a149758ccf2018ea1a9 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sun, 17 May 2026 18:52:35 +0200 Subject: [PATCH 02/26] fix: preserve full owner and permissions via PATCH after document upload --- ocr.go | 26 ++++++++++++++++++++++---- paperless.go | 25 ++++++++++++++++++++++++- types.go | 14 ++++++++++++++ 3 files changed, 60 insertions(+), 5 deletions(-) diff --git a/ocr.go b/ocr.go index a25b4369..bff85a3d 100644 --- a/ocr.go +++ b/ocr.go @@ -489,10 +489,6 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData "title": originalDoc.Title, } - if originalDoc.Owner != nil { - metadata["owner"] = *originalDoc.Owner - } - // Copy metadata from original document if requested if options.CopyMetadata { // Get tag IDs @@ -594,6 +590,28 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData if status == "SUCCESS" { logger.Info("Document processing completed successfully") + + // Restore owner and permissions on the new document + if resultMap, ok := taskStatus["result"].(map[string]interface{}); ok { + if newDocIDFloat, ok := resultMap["document_id"].(float64); ok { + newDocID := int(newDocIDFloat) + logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") + + patchFields := make(map[string]interface{}) + if originalDoc.Owner != nil { + patchFields["owner"] = *originalDoc.Owner + } + if originalDoc.Permissions != nil { + patchFields["set_permissions"] = originalDoc.Permissions + } + + if 
len(patchFields) > 0 { + if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { + logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") + } + } + } + } break } diff --git a/paperless.go b/paperless.go index afa9ca12..425860e1 100644 --- a/paperless.go +++ b/paperless.go @@ -360,7 +360,7 @@ func (client *PaperlessClient) DownloadPDF(ctx context.Context, document Documen func (client *PaperlessClient) GetDocument(ctx context.Context, documentID int) (Document, error) { // TODO: This function can be optimized by caching the results of GetAllTags, GetAllCorrespondents, and GetCustomFields. // A simple time-based cache could be implemented in the PaperlessClient to avoid fetching this data on every call. - path := fmt.Sprintf("api/documents/%d/", documentID) + path := fmt.Sprintf("api/documents/%d/?full_perms=true", documentID) resp, err := client.Do(ctx, "GET", path, nil) if err != nil { return Document{}, err @@ -448,9 +448,32 @@ func (client *PaperlessClient) GetDocument(ctx context.Context, documentID int) DocumentTypeName: documentTypeName, DocumentType: documentResponse.DocumentType, Owner: documentResponse.Owner, + Permissions: documentResponse.Permissions, }, nil } +// PatchDocument patches a document with the given fields +func (client *PaperlessClient) PatchDocument(ctx context.Context, documentID int, fields map[string]interface{}) error { + jsonData, err := json.Marshal(fields) + if err != nil { + return fmt.Errorf("error marshalling JSON: %w", err) + } + + path := fmt.Sprintf("api/documents/%d/", documentID) + resp, err := client.Do(ctx, "PATCH", path, bytes.NewBuffer(jsonData)) + if err != nil { + return fmt.Errorf("error patching document %d: %w", documentID, err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + bodyBytes, _ := io.ReadAll(resp.Body) + return fmt.Errorf("error patching document %d: %d, %s", documentID, resp.StatusCode, string(bodyBytes)) + } + + return nil 
+} + // UpdateDocuments updates the specified documents with suggested changes func (client *PaperlessClient) UpdateDocuments(ctx context.Context, documents []DocumentSuggestion, db *gorm.DB, isUndo bool) error { availableTags, err := client.GetAllTags(ctx) diff --git a/types.go b/types.go index df1a8ed0..ef838606 100644 --- a/types.go +++ b/types.go @@ -58,6 +58,18 @@ type CustomFieldSuggestion struct { Value interface{} `json:"value"` } +// PermissionSet defines view/change permissions for users and groups +type PermissionSet struct { + Users []int `json:"users"` + Groups []int `json:"groups"` +} + +// Permissions holds the full permission structure for a document +type Permissions struct { + View PermissionSet `json:"view"` + Change PermissionSet `json:"change"` +} + // GetDocumentApiResponse is the response payload for /documents/{id} endpoint. // But we are only interested in a subset of the fields. type GetDocumentApiResponse struct { @@ -72,6 +84,7 @@ type GetDocumentApiResponse struct { Notes []interface{} `json:"notes"` CustomFields []CustomFieldResponse `json:"custom_fields"` Owner *int `json:"owner"` + Permissions *Permissions `json:"permissions,omitempty"` } // Document is a stripped down version of the document object from paperless-ngx. @@ -88,6 +101,7 @@ type Document struct { DocumentType int `json:"document_type"` CustomFields []CustomFieldResponse `json:"custom_fields"` Owner *int `json:"owner"` + Permissions *Permissions `json:"permissions,omitempty"` } // GenerateSuggestionsRequest is the request payload for generating suggestions for /generate-suggestions endpoint From 949ce8ac080f1400bd154434dcf820e2690965b6 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 13:53:48 +0200 Subject: [PATCH 03/26] feat: add iOS OCR Server provider Add a new OCR provider (ios_ocr) that sends images to the iOS OCR Server app running on iPhone via its /upload API and extracts the recognized text. 
- New environment variables: IOS_OCR_SERVER_URL (required) and IOS_OCR_SERVER_TIMEOUT (optional, default 60s) - Supports image processing mode only - Returns plain OCR text, mirroring all existing provider behavior - Full test coverage following existing patterns --- main.go | 19 ++++ ocr/iosocr_provider.go | 167 ++++++++++++++++++++++++++++++++++ ocr/iosocr_provider_test.go | 172 ++++++++++++++++++++++++++++++++++++ ocr/provider.go | 11 +++ 4 files changed, 369 insertions(+) create mode 100644 ocr/iosocr_provider.go create mode 100644 ocr/iosocr_provider_test.go diff --git a/main.go b/main.go index a9c68201..43808d96 100644 --- a/main.go +++ b/main.go @@ -83,6 +83,8 @@ var ( doclingImageExportMode = os.Getenv("DOCLING_IMAGE_EXPORT_MODE") doclingOCRPipeline = os.Getenv("DOCLING_OCR_PIPELINE") doclingOCREngine = os.Getenv("DOCLING_OCR_ENGINE") + iosOcrServerURL = os.Getenv("IOS_OCR_SERVER_URL") + iosOcrServerTimeout = os.Getenv("IOS_OCR_SERVER_TIMEOUT") googleThinkingBudget *int32 // Will be parsed from GOOGLEAI_THINKING_BUDGET // Templates @@ -281,6 +283,7 @@ func main() { DoclingImageExportMode: doclingImageExportMode, DoclingOCRPipeline: doclingOCRPipeline, DoclingOCREngine: doclingOCREngine, + IosOcrServerURL: iosOcrServerURL, EnableHOCR: true, // Always generate hOCR struct if provider supports it VisionLLMMaxTokens: visionLlmMaxTokens, VisionLLMTemperature: visionLlmTemperature, @@ -299,6 +302,15 @@ func main() { } } + // Parse iOS OCR Server timeout if set + if iosOcrServerTimeout != "" { + if timeout, err := strconv.Atoi(iosOcrServerTimeout); err == nil { + ocrConfig.IosOcrServerTimeout = timeout + } else { + log.Warnf("Invalid IOS_OCR_SERVER_TIMEOUT value: %v, using default (60)", err) + } + } + // If provider is LLM, but no VISION_LLM_PROVIDER is set, don't initialize OCR provider if providerType == "llm" && visionLlmProvider == "" { log.Warn("OCR provider is set to LLM, but no VISION_LLM_PROVIDER is set. 
Disabling OCR.") @@ -537,6 +549,7 @@ func validateOCRProviderModeCompatibility(provider, mode, visionProvider string) "google_docai": {"image", "pdf", "whole_pdf"}, // Google Document AI supports all modes "mistral_ocr": {"image", "pdf", "whole_pdf"}, // Mistral OCR supports all modes "docling": {"image", "pdf", "whole_pdf"}, // Docling supports image and PDF modes + "ios_ocr": {"image"}, // iOS OCR Server supports image mode only } // Google Gemini API natively handles PDF documents @@ -639,6 +652,12 @@ func validateOrDefaultEnvVars() { } } + if ocrProvider == "ios_ocr" { + if iosOcrServerURL == "" { + log.Fatal("Please set the IOS_OCR_SERVER_URL environment variable for iOS OCR Server provider") + } + } + if llmModel == "" { log.Fatal("Please set the LLM_MODEL environment variable.") } diff --git a/ocr/iosocr_provider.go b/ocr/iosocr_provider.go new file mode 100644 index 00000000..5cc11348 --- /dev/null +++ b/ocr/iosocr_provider.go @@ -0,0 +1,167 @@ +package ocr + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "mime/multipart" + "net/http" + "strings" + "time" + + "github.com/hashicorp/go-retryablehttp" + "github.com/sirupsen/logrus" +) + +const ( + defaultIosOcrTimeout = 60 +) + +// IosOcrProvider implements OCR using the iOS OCR Server app +type IosOcrProvider struct { + serverURL string + httpClient *retryablehttp.Client +} + +// IosOcrUploadResponse mirrors the JSON response from the iOS OCR Server +type IosOcrUploadResponse struct { + Success bool `json:"success"` + Message string `json:"message"` + OcrResult string `json:"ocr_result"` + ImageWidth int `json:"image_width"` + ImageHeight int `json:"image_height"` + OcrBoxes interface{} `json:"ocr_boxes"` +} + +func newIosOcrProvider(config Config) (*IosOcrProvider, error) { + logger := log.WithFields(logrus.Fields{ + "server_url": config.IosOcrServerURL, + }) + logger.Info("Creating new iOS OCR Server provider") + + if config.IosOcrServerURL == "" { + return nil, fmt.Errorf("missing 
required iOS OCR Server URL") + } + + timeout := defaultIosOcrTimeout + if config.IosOcrServerTimeout > 0 { + timeout = config.IosOcrServerTimeout + } + + client := retryablehttp.NewClient() + client.RetryMax = 3 + client.RetryWaitMin = 1 * time.Second + client.RetryWaitMax = 10 * time.Second + client.HTTPClient.Timeout = time.Duration(timeout) * time.Second + client.Logger = logger + + // Normalize server URL: strip trailing slash for consistent URL building + serverURL := strings.TrimRight(config.IosOcrServerURL, "/") + + provider := &IosOcrProvider{ + serverURL: serverURL, + httpClient: client, + } + + logger.Info("Successfully initialized iOS OCR Server provider") + return provider, nil +} + +func (p *IosOcrProvider) ProcessImage(ctx context.Context, imageContent []byte, pageNumber int) (*OCRResult, error) { + logger := log.WithFields(logrus.Fields{ + "provider": "ios_ocr", + "url": p.serverURL, + "page": pageNumber, + }) + logger.Debug("Starting iOS OCR Server processing") + + uploadURL := p.serverURL + "/upload" + + // Build multipart form request + var requestBody bytes.Buffer + writer := multipart.NewWriter(&requestBody) + + part, err := writer.CreateFormFile("file", "document.png") + if err != nil { + logger.WithError(err).Error("Failed to create form file") + return nil, fmt.Errorf("failed to create form file: %w", err) + } + + _, err = io.Copy(part, bytes.NewReader(imageContent)) + if err != nil { + logger.WithError(err).Error("Failed to copy image content to form") + return nil, fmt.Errorf("failed to copy image content to form: %w", err) + } + + err = writer.Close() + if err != nil { + logger.WithError(err).Error("Failed to close multipart writer") + return nil, fmt.Errorf("failed to close multipart writer: %w", err) + } + + // Create HTTP request + req, err := retryablehttp.NewRequestWithContext(ctx, "POST", uploadURL, &requestBody) + if err != nil { + logger.WithError(err).Error("Failed to create HTTP request") + return nil, fmt.Errorf("error creating 
iOS OCR request: %w", err) + } + + req.Header.Set("Content-Type", writer.FormDataContentType()) + req.Header.Set("Accept", "application/json") + + logger.WithField("url", uploadURL).Debug("Sending request to iOS OCR Server") + + resp, err := p.httpClient.Do(req) + if err != nil { + logger.WithError(err).Error("Failed to send request to iOS OCR Server") + return nil, fmt.Errorf("error sending request to iOS OCR Server: %w", err) + } + defer resp.Body.Close() + + respBodyBytes, err := io.ReadAll(resp.Body) + if err != nil { + logger.WithError(err).Error("Failed to read response body") + return nil, fmt.Errorf("error reading iOS OCR response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + logger.WithFields(logrus.Fields{ + "status_code": resp.StatusCode, + "response": string(respBodyBytes), + }).Error("Received non-OK status from iOS OCR Server") + return nil, fmt.Errorf("iOS OCR Server returned status %d: %s", resp.StatusCode, string(respBodyBytes)) + } + + var ocrResp IosOcrUploadResponse + if err := json.Unmarshal(respBodyBytes, &ocrResp); err != nil { + logger.WithError(err).WithField("response", string(respBodyBytes)).Error("Failed to parse iOS OCR JSON response") + return nil, fmt.Errorf("error parsing iOS OCR JSON response: %w", err) + } + + if !ocrResp.Success { + logger.WithFields(logrus.Fields{ + "message": ocrResp.Message, + }).Error("iOS OCR Server returned failure") + return nil, fmt.Errorf("iOS OCR Server processing failed: %s", ocrResp.Message) + } + + result := &OCRResult{ + Text: ocrResp.OcrResult, + Metadata: map[string]string{ + "provider": "ios_ocr", + "has_content": fmt.Sprintf("%t", ocrResp.OcrResult != ""), + "image_width": fmt.Sprintf("%d", ocrResp.ImageWidth), + "image_height": fmt.Sprintf("%d", ocrResp.ImageHeight), + }, + } + + logger.WithFields(logrus.Fields{ + "content_length": len(result.Text), + "image_width": ocrResp.ImageWidth, + "image_height": ocrResp.ImageHeight, + }).Info("Successfully processed image with iOS OCR 
Server") + + return result, nil +} diff --git a/ocr/iosocr_provider_test.go b/ocr/iosocr_provider_test.go new file mode 100644 index 00000000..c03da91f --- /dev/null +++ b/ocr/iosocr_provider_test.go @@ -0,0 +1,172 @@ +package ocr + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + + "github.com/hashicorp/go-retryablehttp" + "github.com/stretchr/testify/assert" +) + +func setupIosOcrTestServer(t *testing.T, handler http.HandlerFunc) *httptest.Server { + t.Helper() + return httptest.NewServer(handler) +} + +func newTestIosOcrProvider(serverURL string) *IosOcrProvider { + client := retryablehttp.NewClient() + client.RetryMax = 0 + client.Logger = nil + + return &IosOcrProvider{ + serverURL: serverURL, + httpClient: client, + } +} + +func TestIosOcrProvider_ProcessImage(t *testing.T) { + sampleImageContent := []byte("dummy image data") + + tests := []struct { + name string + mockHandler func(w http.ResponseWriter, r *http.Request) + expectedResult *OCRResult + expectedErrStr string + checkRequest func(r *http.Request) + }{ + { + name: "Success Case", + mockHandler: func(w http.ResponseWriter, r *http.Request) { + assert.Equal(t, "/upload", r.URL.Path) + assert.Equal(t, "POST", r.Method) + assert.Contains(t, r.Header.Get("Content-Type"), "multipart/form-data") + assert.Equal(t, "application/json", r.Header.Get("Accept")) + + resp := IosOcrUploadResponse{ + Success: true, + Message: "File uploaded successfully", + OcrResult: "Hello\nWorld", + ImageWidth: 1247, + ImageHeight: 648, + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) + }, + expectedResult: &OCRResult{ + Text: "Hello\nWorld", + Metadata: map[string]string{ + "provider": "ios_ocr", + "has_content": "true", + "image_width": "1247", + "image_height": "648", + }, + }, + checkRequest: func(r *http.Request) { + err := r.ParseMultipartForm(10 << 20) + assert.NoError(t, err) + f, fh, err := r.FormFile("file") + assert.NoError(t, 
err) + assert.NotNil(t, f) + assert.Equal(t, "document.png", fh.Filename) + fileContent, _ := io.ReadAll(f) + assert.Equal(t, sampleImageContent, fileContent) + f.Close() + }, + }, + { + name: "Success Case - Empty OCR Result", + mockHandler: func(w http.ResponseWriter, r *http.Request) { + resp := IosOcrUploadResponse{ + Success: true, + Message: "File uploaded successfully", + OcrResult: "", + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) + }, + expectedResult: &OCRResult{ + Text: "", + Metadata: map[string]string{ + "provider": "ios_ocr", + "has_content": "false", + "image_width": "0", + "image_height": "0", + }, + }, + }, + { + name: "Server Returns Failure", + mockHandler: func(w http.ResponseWriter, r *http.Request) { + resp := IosOcrUploadResponse{ + Success: false, + Message: "Error processing image", + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) + }, + expectedErrStr: "iOS OCR Server processing failed: Error processing image", + }, + { + name: "Non-OK HTTP Status", + mockHandler: func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + w.Write([]byte("Internal Server Error")) + }, + expectedErrStr: "iOS OCR Server returned status 500", + }, + { + name: "Invalid JSON Response", + mockHandler: func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + w.Write([]byte("this is not json")) + }, + expectedErrStr: "error parsing iOS OCR JSON response", + }, + { + name: "Server Connection Error", + mockHandler: func(w http.ResponseWriter, r *http.Request) { + }, + expectedErrStr: "connection refused", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + checkedHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if tt.checkRequest != nil { + tt.checkRequest(r) + } + tt.mockHandler(w, r) + }) + + server := 
setupIosOcrTestServer(t, checkedHandler) + + serverURL := server.URL + if tt.name == "Server Connection Error" { + server.Close() + } + + provider := newTestIosOcrProvider(serverURL) + + if tt.name != "Server Connection Error" { + defer server.Close() + } + + result, err := provider.ProcessImage(context.Background(), sampleImageContent, 1) + + if tt.expectedErrStr != "" { + assert.Error(t, err) + assert.Contains(t, err.Error(), tt.expectedErrStr) + assert.Nil(t, result) + } else { + assert.NoError(t, err) + assert.Equal(t, tt.expectedResult, result) + } + }) + } +} diff --git a/ocr/provider.go b/ocr/provider.go index ea06b5bf..2ff68854 100644 --- a/ocr/provider.go +++ b/ocr/provider.go @@ -78,6 +78,10 @@ type Config struct { DoclingOCRPipeline string // Optional, defaults to "vlm" DoclingOCREngine string // Optional, defaults to "easyocr", if DoclingOCRPipeline == "standard" + // iOS OCR Server settings + IosOcrServerURL string + IosOcrServerTimeout int // seconds, default 60 + // OCR output options EnableHOCR bool // Whether to generate hOCR data if supported by the provider HOCROutputPath string // Where to save hOCR output files @@ -142,6 +146,13 @@ func NewProvider(config Config) (Provider, error) { }).Info("Using Mistral OCR provider") return newMistralOCRProvider(config) + case "ios_ocr": + if config.IosOcrServerURL == "" { + return nil, fmt.Errorf("missing required iOS OCR Server URL (IOS_OCR_SERVER_URL)") + } + log.WithField("url", config.IosOcrServerURL).Info("Using iOS OCR Server provider") + return newIosOcrProvider(config) + default: return nil, fmt.Errorf("unsupported OCR provider: %s", config.Provider) } From 3d14a8c47af9163defde2f0837f478ed378db8cb Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 13:55:50 +0200 Subject: [PATCH 04/26] fix: update Alpine package versions in Dockerfile Bump musl-dev to 1.2.5-r11 and mupdf/mupdf-dev to 1.24.10-r1 to match current Alpine 3.21 repository. 
--- Dockerfile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index f30094a9..36f78832 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,15 +30,14 @@ FROM docker.io/golang:1.25.5-alpine3.21 AS builder # Set the working directory inside the container WORKDIR /app -# Package versions for Renovate # renovate: datasource=repology depName=alpine_3_21/gcc versioning=loose ENV GCC_VERSION="14.2.0-r4" # renovate: datasource=repology depName=alpine_3_21/musl-dev versioning=loose -ENV MUSL_DEV_VERSION="1.2.5-r9" +ENV MUSL_DEV_VERSION="1.2.5-r11" # renovate: datasource=repology depName=alpine_3_21/mupdf versioning=loose -ENV MUPDF_VERSION="1.24.10-r0" +ENV MUPDF_VERSION="1.24.10-r1" # renovate: datasource=repology depName=alpine_3_21/mupdf-dev versioning=loose -ENV MUPDF_DEV_VERSION="1.24.10-r0" +ENV MUPDF_DEV_VERSION="1.24.10-r1" # renovate: datasource=repology depName=alpine_3_21/sed versioning=loose ENV SED_VERSION="4.9-r2" From 8c0862a37fe8bc3775e84726bc247f5cbf400ce1 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 13:56:47 +0200 Subject: [PATCH 05/26] fix: remove Alpine version pins to fix build Removes pinned package versions from Dockerfile to avoid build failures when Alpine repos update. 
--- Dockerfile | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index 36f78832..bdc9d193 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,24 +30,13 @@ FROM docker.io/golang:1.25.5-alpine3.21 AS builder # Set the working directory inside the container WORKDIR /app -# renovate: datasource=repology depName=alpine_3_21/gcc versioning=loose -ENV GCC_VERSION="14.2.0-r4" -# renovate: datasource=repology depName=alpine_3_21/musl-dev versioning=loose -ENV MUSL_DEV_VERSION="1.2.5-r11" -# renovate: datasource=repology depName=alpine_3_21/mupdf versioning=loose -ENV MUPDF_VERSION="1.24.10-r1" -# renovate: datasource=repology depName=alpine_3_21/mupdf-dev versioning=loose -ENV MUPDF_DEV_VERSION="1.24.10-r1" -# renovate: datasource=repology depName=alpine_3_21/sed versioning=loose -ENV SED_VERSION="4.9-r2" - -# Install necessary packages with pinned versions +# Install necessary packages RUN apk add --no-cache \ - "gcc=${GCC_VERSION}" \ - "musl-dev=${MUSL_DEV_VERSION}" \ - "mupdf=${MUPDF_VERSION}" \ - "mupdf-dev=${MUPDF_DEV_VERSION}" \ - "sed=${SED_VERSION}" + gcc \ + musl-dev \ + mupdf \ + mupdf-dev \ + sed # Copy go.mod and go.sum files COPY go.mod go.sum ./ From f7c242512a75a7eb775a7f2264e5666166d0e906 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 14:32:01 +0200 Subject: [PATCH 06/26] docs: add iOS OCR Server to README --- README.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3eda3cee..8707a13f 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,7 @@ https://github.com/user-attachments/assets/bd5d38b9-9309-40b9-93ca-918dfa4f3fd4 - **Google Document AI**: Leverage Google's powerful Document AI for OCR tasks. - **Azure Document Intelligence**: Use Microsoft's enterprise OCR solution. 
- **Docling Server**: Self-hosted OCR and document conversion service + - **iOS OCR Server**: Use Apple's Vision Framework via an iPhone for private, on-device OCR 3. **Automatic Title, Tag & Created Date Generation** No more guesswork. Let the AI do the naming and categorizing. You can easily review suggestions and refine them if needed. @@ -88,6 +89,7 @@ https://github.com/user-attachments/assets/bd5d38b9-9309-40b9-93ca-918dfa4f3fd4 - [2. Azure Document Intelligence](#2-azure-document-intelligence) - [3. Google Document AI](#3-google-document-ai) - [4. Docling Server](#4-docling-server) + - [5. iOS OCR Server](#5-ios-ocr-server) - [OCR Processing Modes](#ocr-processing-modes) - [Image Mode (Default)](#image-mode-default) - [PDF Mode](#pdf-mode) @@ -383,6 +385,24 @@ paperless-gpt supports four different OCR providers, each with unique strengths DOCLING_OCR_ENGINE: "macocr" # Optional, defaults to "easyocr" (only used when `DOCLING_OCR_PIPELINE is set to 'standard') ``` +### 5. iOS OCR Server + +- **Key Features**: + - Uses Apple's Vision Framework via an iPhone for on-device OCR + - 100% local processing, no cloud dependencies, full privacy + - Supports multiple languages with automatic detection + - No API keys or external accounts needed +- **Best For**: + - Users with an iOS device on the same network + - Privacy-sensitive environments + - Quick setup without cloud OCR services +- **Configuration**: + ```yaml + OCR_PROVIDER: "ios_ocr" + IOS_OCR_SERVER_URL: "http://192.168.1.100:8000" + IOS_OCR_SERVER_TIMEOUT: "60" # optional, default 60s + ``` + ## OCR Processing Modes paperless-gpt offers different methods for processing documents, giving you flexibility based on your needs and OCR provider capabilities: @@ -417,6 +437,7 @@ Different OCR providers support different processing modes: | **Google Document AI** | ✅ | ✅ | ✅ | | **Mistral OCR** | ✅ | ✅ | ✅ | | **Docling Server** | ✅ | ✅ | ✅ | +| **iOS OCR Server** | ✅ | ❌ | ❌ | > **Important**: paperless-gpt will 
validate your configuration at startup and prevent unsupported mode/provider combinations. If you specify an unsupported mode for your provider, the application will fail to start with a clear error message. @@ -558,7 +579,7 @@ For best results with the enhanced OCR features: | `LLM_REQUESTS_PER_MINUTE` | Maximum requests per minute for the main LLM. Useful for managing API costs or local LLM load. | No | 120 | | `LLM_MAX_RETRIES` | Maximum retry attempts for failed main LLM requests. | No | 3 | | `LLM_BACKOFF_MAX_WAIT` | Maximum wait time between retries for the main LLM (e.g., `30s`). | No | 30s | -| `OCR_PROVIDER` | OCR provider to use (`llm`, `azure`, or `google_docai`). | No | llm | +| `OCR_PROVIDER` | OCR provider to use (`llm`, `azure`, `google_docai`, `docling`, `mistral_ocr`, `ios_ocr`). | No | llm | | `OCR_PROCESS_MODE` | Method for processing documents: `image` (convert to images first), `pdf` (process PDF pages directly), or `whole_pdf` (entire PDF at once). | No | image | | `VISION_LLM_PROVIDER` | AI backend for LLM OCR (`openai`, `ollama`, `mistral`, or `anthropic`). Required if OCR_PROVIDER is `llm`. | Cond. | | | `VISION_LLM_MODEL` | Model name for LLM OCR (e.g. `minicpm-v`). Required if OCR_PROVIDER is `llm`. | Cond. | | @@ -582,6 +603,8 @@ For best results with the enhanced OCR features: | `DOCLING_IMAGE_EXPORT_MODE` | Mode for image export. Optional; defaults to `embedded` if unset. | No | embedded | | `DOCLING_OCR_PIPELINE` | Sets the pipeline type. Optional; defaults to `vlm` if unset. | No | vlm | | `DOCLING_OCR_ENGINE` | Sets the ocr engine, if `DOCLING_OCR_PIPELINE` is set to `standard`. Optional; defaults to `easyocr` | No | easyocr | +| `IOS_OCR_SERVER_URL` | URL of the iOS OCR Server (e.g. `http://192.168.1.100:8000`). Required if OCR_PROVIDER is `ios_ocr`. | Cond. | | +| `IOS_OCR_SERVER_TIMEOUT` | HTTP request timeout in seconds for the iOS OCR Server. | No | 60 | | `CREATE_LOCAL_HOCR` | Whether to save hOCR files locally. 
| No | false | | `LOCAL_HOCR_PATH` | Path where hOCR files will be saved when hOCR generation is enabled. | No | /app/hocr | | `CREATE_LOCAL_PDF` | Whether to save enhanced PDFs locally. | No | false | From d23634f8be18e6a7ea9a84f86ae372ad21dcbf57 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 14:37:48 +0200 Subject: [PATCH 07/26] Revert "fix: remove Alpine version pins to fix build" This reverts commit bc6cbf8875366adae20e76c064b6338a607a67d5. --- Dockerfile | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index bdc9d193..36f78832 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,13 +30,24 @@ FROM docker.io/golang:1.25.5-alpine3.21 AS builder # Set the working directory inside the container WORKDIR /app -# Install necessary packages +# renovate: datasource=repology depName=alpine_3_21/gcc versioning=loose +ENV GCC_VERSION="14.2.0-r4" +# renovate: datasource=repology depName=alpine_3_21/musl-dev versioning=loose +ENV MUSL_DEV_VERSION="1.2.5-r11" +# renovate: datasource=repology depName=alpine_3_21/mupdf versioning=loose +ENV MUPDF_VERSION="1.24.10-r1" +# renovate: datasource=repology depName=alpine_3_21/mupdf-dev versioning=loose +ENV MUPDF_DEV_VERSION="1.24.10-r1" +# renovate: datasource=repology depName=alpine_3_21/sed versioning=loose +ENV SED_VERSION="4.9-r2" + +# Install necessary packages with pinned versions RUN apk add --no-cache \ - gcc \ - musl-dev \ - mupdf \ - mupdf-dev \ - sed + "gcc=${GCC_VERSION}" \ + "musl-dev=${MUSL_DEV_VERSION}" \ + "mupdf=${MUPDF_VERSION}" \ + "mupdf-dev=${MUPDF_DEV_VERSION}" \ + "sed=${SED_VERSION}" # Copy go.mod and go.sum files COPY go.mod go.sum ./ From 9912d85dbe1e69fe1c509a8acef7f3b810ba37d1 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 14:41:39 +0200 Subject: [PATCH 08/26] chore: restore original Dockerfile from upstream --- Dockerfile | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile 
b/Dockerfile index 36f78832..f30094a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,14 +30,15 @@ FROM docker.io/golang:1.25.5-alpine3.21 AS builder # Set the working directory inside the container WORKDIR /app +# Package versions for Renovate # renovate: datasource=repology depName=alpine_3_21/gcc versioning=loose ENV GCC_VERSION="14.2.0-r4" # renovate: datasource=repology depName=alpine_3_21/musl-dev versioning=loose -ENV MUSL_DEV_VERSION="1.2.5-r11" +ENV MUSL_DEV_VERSION="1.2.5-r9" # renovate: datasource=repology depName=alpine_3_21/mupdf versioning=loose -ENV MUPDF_VERSION="1.24.10-r1" +ENV MUPDF_VERSION="1.24.10-r0" # renovate: datasource=repology depName=alpine_3_21/mupdf-dev versioning=loose -ENV MUPDF_DEV_VERSION="1.24.10-r1" +ENV MUPDF_DEV_VERSION="1.24.10-r0" # renovate: datasource=repology depName=alpine_3_21/sed versioning=loose ENV SED_VERSION="4.9-r2" From 50e16afb7ffa3b0c9fe2dcaa28484671d8508b1d Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 14:43:21 +0200 Subject: [PATCH 09/26] docs: add App Store link for iOS OCR Server --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 8707a13f..ecc68aad 100644 --- a/README.md +++ b/README.md @@ -387,6 +387,8 @@ paperless-gpt supports four different OCR providers, each with unique strengths ### 5. iOS OCR Server +Uses the [OCR Server](https://apps.apple.com/ch/app/ocr-server/id6749533041) iOS app to perform OCR using Apple's Vision Framework. 
+ - **Key Features**: - Uses Apple's Vision Framework via an iPhone for on-device OCR - 100% local processing, no cloud dependencies, full privacy From 48306ba05162ff0a8751920f9ceaa6b3b3484bfb Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 15:35:08 +0200 Subject: [PATCH 10/26] fix: validate timeout > 0 and use bounded reader for iOS OCR responses --- main.go | 6 +++++- ocr/iosocr_provider.go | 17 +++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/main.go b/main.go index 43808d96..f73170c1 100644 --- a/main.go +++ b/main.go @@ -305,7 +305,11 @@ func main() { // Parse iOS OCR Server timeout if set if iosOcrServerTimeout != "" { if timeout, err := strconv.Atoi(iosOcrServerTimeout); err == nil { - ocrConfig.IosOcrServerTimeout = timeout + if timeout > 0 { + ocrConfig.IosOcrServerTimeout = timeout + } else { + log.Warnf("Invalid IOS_OCR_SERVER_TIMEOUT value: %d, must be positive, using default (60)", timeout) + } } else { log.Warnf("Invalid IOS_OCR_SERVER_TIMEOUT value: %v, using default (60)", err) } diff --git a/ocr/iosocr_provider.go b/ocr/iosocr_provider.go index 5cc11348..aca5aa8a 100644 --- a/ocr/iosocr_provider.go +++ b/ocr/iosocr_provider.go @@ -17,6 +17,7 @@ import ( const ( defaultIosOcrTimeout = 60 + maxResponseSize = 1 * 1024 * 1024 // 1MB ) // IosOcrProvider implements OCR using the iOS OCR Server app @@ -120,24 +121,28 @@ func (p *IosOcrProvider) ProcessImage(ctx context.Context, imageContent []byte, } defer resp.Body.Close() - respBodyBytes, err := io.ReadAll(resp.Body) + respBodyBytes, err := io.ReadAll(io.LimitReader(resp.Body, maxResponseSize)) if err != nil { logger.WithError(err).Error("Failed to read response body") return nil, fmt.Errorf("error reading iOS OCR response body: %w", err) } + respSize := len(respBodyBytes) if resp.StatusCode != http.StatusOK { logger.WithFields(logrus.Fields{ - "status_code": resp.StatusCode, - "response": string(respBodyBytes), + "status_code": resp.StatusCode, + 
"response_size": respSize, }).Error("Received non-OK status from iOS OCR Server") - return nil, fmt.Errorf("iOS OCR Server returned status %d: %s", resp.StatusCode, string(respBodyBytes)) + return nil, fmt.Errorf("iOS OCR Server returned status %d (response size: %d bytes)", resp.StatusCode, respSize) } var ocrResp IosOcrUploadResponse if err := json.Unmarshal(respBodyBytes, &ocrResp); err != nil { - logger.WithError(err).WithField("response", string(respBodyBytes)).Error("Failed to parse iOS OCR JSON response") - return nil, fmt.Errorf("error parsing iOS OCR JSON response: %w", err) + logger.WithError(err).WithFields(logrus.Fields{ + "status_code": resp.StatusCode, + "response_size": respSize, + }).Error("Failed to parse iOS OCR JSON response") + return nil, fmt.Errorf("error parsing iOS OCR JSON response (status: %d, size: %d bytes): %w", resp.StatusCode, respSize, err) } if !ocrResp.Success { From 68b065c1124d49645ebb34c4a6c167357b9881fb Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 15:53:34 +0200 Subject: [PATCH 11/26] fix: replace retryablehttp with plain http.Client to avoid duplicate POST retries Also adds docstring to newIosOcrProvider to satisfy docstring coverage threshold. The background loop in background.go already handles document-level retry via exponential backoff, so HTTP-level retry is redundant and harmful for non-idempotent POST /upload. 
--- ocr/iosocr_provider.go | 15 ++++++--------- ocr/iosocr_provider_test.go | 7 +------ 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/ocr/iosocr_provider.go b/ocr/iosocr_provider.go index aca5aa8a..e1bb1f9d 100644 --- a/ocr/iosocr_provider.go +++ b/ocr/iosocr_provider.go @@ -11,7 +11,6 @@ import ( "strings" "time" - "github.com/hashicorp/go-retryablehttp" "github.com/sirupsen/logrus" ) @@ -23,7 +22,7 @@ const ( // IosOcrProvider implements OCR using the iOS OCR Server app type IosOcrProvider struct { serverURL string - httpClient *retryablehttp.Client + httpClient *http.Client } // IosOcrUploadResponse mirrors the JSON response from the iOS OCR Server @@ -36,6 +35,7 @@ type IosOcrUploadResponse struct { OcrBoxes interface{} `json:"ocr_boxes"` } +// newIosOcrProvider creates a new IosOcrProvider with the given configuration. func newIosOcrProvider(config Config) (*IosOcrProvider, error) { logger := log.WithFields(logrus.Fields{ "server_url": config.IosOcrServerURL, @@ -51,12 +51,9 @@ func newIosOcrProvider(config Config) (*IosOcrProvider, error) { timeout = config.IosOcrServerTimeout } - client := retryablehttp.NewClient() - client.RetryMax = 3 - client.RetryWaitMin = 1 * time.Second - client.RetryWaitMax = 10 * time.Second - client.HTTPClient.Timeout = time.Duration(timeout) * time.Second - client.Logger = logger + client := &http.Client{ + Timeout: time.Duration(timeout) * time.Second, + } // Normalize server URL: strip trailing slash for consistent URL building serverURL := strings.TrimRight(config.IosOcrServerURL, "/") @@ -103,7 +100,7 @@ func (p *IosOcrProvider) ProcessImage(ctx context.Context, imageContent []byte, } // Create HTTP request - req, err := retryablehttp.NewRequestWithContext(ctx, "POST", uploadURL, &requestBody) + req, err := http.NewRequestWithContext(ctx, "POST", uploadURL, &requestBody) if err != nil { logger.WithError(err).Error("Failed to create HTTP request") return nil, fmt.Errorf("error creating iOS OCR request: %w", err) 
diff --git a/ocr/iosocr_provider_test.go b/ocr/iosocr_provider_test.go index c03da91f..fbc7cfc6 100644 --- a/ocr/iosocr_provider_test.go +++ b/ocr/iosocr_provider_test.go @@ -8,7 +8,6 @@ import ( "net/http/httptest" "testing" - "github.com/hashicorp/go-retryablehttp" "github.com/stretchr/testify/assert" ) @@ -18,13 +17,9 @@ func setupIosOcrTestServer(t *testing.T, handler http.HandlerFunc) *httptest.Ser } func newTestIosOcrProvider(serverURL string) *IosOcrProvider { - client := retryablehttp.NewClient() - client.RetryMax = 0 - client.Logger = nil - return &IosOcrProvider{ serverURL: serverURL, - httpClient: client, + httpClient: &http.Client{}, } } From 058247860f42e694c5d86cb7a00b8bd4c1a1ad2a Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 18:08:24 +0200 Subject: [PATCH 12/26] feat: add hOCR support for ios_ocr provider MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements the HOCRCapable interface on IosOcrProvider to translate the iOS OCR Server's ocr_boxes array into hOCR Page structures. - Adds IosOcrBox struct for typed box parsing - Adds buildHOCRPage helper: sorts words top-to-bottom, groups into lines by Y-proximity, builds hOCR hierarchy (Page → Lines → Words) - Adds parseOcrBoxes helper for safe interface{} parsing via marshal/unmarshal round-trip - Implements IsHOCREnabled, GetHOCRPages, GetHOCRDocument, ResetHOCR for the HOCRCapable interface - hOCR page creation is gated by config.EnableHOCR (ENABLE_HOCR env var) - Includes unit tests for hOCR page building, box parsing, and end-to-end ProcessImage hOCR flow Follow-up to the base ios_ocr provider PR. 
--- ocr/iosocr_provider.go | 191 +++++++++++++++++++++++++++++++++++- ocr/iosocr_provider_test.go | 150 ++++++++++++++++++++++++++++ 2 files changed, 340 insertions(+), 1 deletion(-) diff --git a/ocr/iosocr_provider.go b/ocr/iosocr_provider.go index e1bb1f9d..e6006aab 100644 --- a/ocr/iosocr_provider.go +++ b/ocr/iosocr_provider.go @@ -6,11 +6,15 @@ import ( "encoding/json" "fmt" "io" + "math" "mime/multipart" "net/http" + "sort" "strings" + "sync" "time" + "github.com/gardar/ocrchestra/pkg/hocr" "github.com/sirupsen/logrus" ) @@ -23,6 +27,26 @@ const ( type IosOcrProvider struct { serverURL string httpClient *http.Client + enableHOCR bool + mu sync.Mutex + hocrPages []hocr.Page +} + +// IosOcrBox represents a single recognized word with its bounding box +type IosOcrBox struct { + Text string `json:"text"` + X float64 `json:"x"` + Y float64 `json:"y"` + W float64 `json:"w"` + H float64 `json:"h"` + Rect struct { + TopLeftX float64 `json:"topLeft_x"` + TopLeftY float64 `json:"topLeft_y"` + TopRightX float64 `json:"topRight_x"` + TopRightY float64 `json:"topRight_y"` + BottomRightX float64 `json:"bottomRight_x"` + BottomRightY float64 `json:"bottomRight_y"` + } `json:"rect"` } // IosOcrUploadResponse mirrors the JSON response from the iOS OCR Server @@ -61,9 +85,11 @@ func newIosOcrProvider(config Config) (*IosOcrProvider, error) { provider := &IosOcrProvider{ serverURL: serverURL, httpClient: client, + enableHOCR: config.EnableHOCR, + hocrPages: make([]hocr.Page, 0), } - logger.Info("Successfully initialized iOS OCR Server provider") + logger.WithField("enable_hocr", config.EnableHOCR).Info("Successfully initialized iOS OCR Server provider") return provider, nil } @@ -159,6 +185,21 @@ func (p *IosOcrProvider) ProcessImage(ctx context.Context, imageContent []byte, }, } + // Create hOCR page structure if enabled and boxes are available + if p.enableHOCR && ocrResp.OcrBoxes != nil { + boxes, err := parseOcrBoxes(ocrResp.OcrBoxes) + if err != nil { + 
logger.WithError(err).Warn("Failed to parse OCR boxes for hOCR generation") + } else if len(boxes) > 0 { + hocrPage := buildHOCRPage(boxes, ocrResp.OcrResult, pageNumber, ocrResp.ImageWidth, ocrResp.ImageHeight) + p.mu.Lock() + p.hocrPages = append(p.hocrPages, hocrPage) + p.mu.Unlock() + result.HOCRPage = &hocrPage + logger.WithField("page_number", pageNumber).Info("Created hOCR page") + } + } + logger.WithFields(logrus.Fields{ "content_length": len(result.Text), "image_width": ocrResp.ImageWidth, @@ -167,3 +208,151 @@ func (p *IosOcrProvider) ProcessImage(ctx context.Context, imageContent []byte, return result, nil } + +// --- HOCRCapable interface implementation --- + +// IsHOCREnabled returns whether hOCR generation is enabled +func (p *IosOcrProvider) IsHOCREnabled() bool { + return p.enableHOCR +} + +// GetHOCRPages returns the collected hOCR pages +func (p *IosOcrProvider) GetHOCRPages() []hocr.Page { + p.mu.Lock() + defer p.mu.Unlock() + result := make([]hocr.Page, len(p.hocrPages)) + copy(result, p.hocrPages) + return result +} + +// GetHOCRDocument creates an hOCR document from the collected pages +func (p *IosOcrProvider) GetHOCRDocument() (*hocr.HOCR, error) { + if !p.enableHOCR { + return nil, fmt.Errorf("hOCR generation is not enabled") + } + + p.mu.Lock() + pages := make([]hocr.Page, len(p.hocrPages)) + copy(pages, p.hocrPages) + p.mu.Unlock() + + if len(pages) == 0 { + return nil, fmt.Errorf("no hOCR pages collected") + } + + doc := &hocr.HOCR{ + Title: "iOS OCR Server", + Language: "unknown", + Metadata: map[string]string{ + "ocr-system": "iOS OCR Server (Apple Vision)", + "ocr-number-of-pages": fmt.Sprintf("%d", len(pages)), + "ocr-capabilities": "ocr_page ocr_line ocrx_word", + }, + Pages: pages, + } + return doc, nil +} + +// ResetHOCR clears the collected hOCR pages +func (p *IosOcrProvider) ResetHOCR() { + p.mu.Lock() + defer p.mu.Unlock() + p.hocrPages = make([]hocr.Page, 0) +} + +// --- hOCR helper functions --- + +// parseOcrBoxes attempts 
to parse the raw ocr_boxes field into a typed slice +func parseOcrBoxes(raw interface{}) ([]IosOcrBox, error) { + jsonData, err := json.Marshal(raw) + if err != nil { + return nil, fmt.Errorf("failed to marshal ocr_boxes: %w", err) + } + var boxes []IosOcrBox + if err := json.Unmarshal(jsonData, &boxes); err != nil { + return nil, fmt.Errorf("failed to unmarshal ocr_boxes: %w", err) + } + return boxes, nil +} + +// buildHOCRPage converts a slice of OCR boxes into an hOCR Page structure. +// Words are sorted top-to-bottom, grouped into lines by Y-coordinate proximity, +// and each word becomes an hOCR word with its bounding box. +func buildHOCRPage(boxes []IosOcrBox, fullText string, pageNumber int, imgWidth, imgHeight int) hocr.Page { + page := hocr.Page{ + ID: fmt.Sprintf("page_%d", pageNumber), + PageNumber: pageNumber, + BBox: hocr.NewBoundingBox(0, 0, float64(imgWidth), float64(imgHeight)), + Metadata: make(map[string]string), + } + + if len(boxes) == 0 { + return page + } + + // Sort boxes top-to-bottom, then left-to-right + sorted := make([]IosOcrBox, len(boxes)) + copy(sorted, boxes) + sort.Slice(sorted, func(i, j int) bool { + if math.Abs(sorted[i].Y-sorted[j].Y) < 1 { + return sorted[i].X < sorted[j].X + } + return sorted[i].Y < sorted[j].Y + }) + + // Group into lines: a new line starts when Y difference > half the current box height + var lines [][]IosOcrBox + currentLine := []IosOcrBox{sorted[0]} + + for i := 1; i < len(sorted); i++ { + prev := sorted[i-1] + curr := sorted[i] + if curr.Y-prev.Y > prev.H*0.5 { + lines = append(lines, currentLine) + currentLine = []IosOcrBox{curr} + } else { + currentLine = append(currentLine, curr) + } + } + lines = append(lines, currentLine) + + // Convert each line group into an hocr.Line + for lidx, lineBoxes := range lines { + // Compute line bounding box from word extremes + minX, minY := math.MaxFloat64, math.MaxFloat64 + maxX, maxY := -math.MaxFloat64, -math.MaxFloat64 + for _, b := range lineBoxes { + if b.X < 
minX { + minX = b.X + } + if b.Y < minY { + minY = b.Y + } + if b.X+b.W > maxX { + maxX = b.X + b.W + } + if b.Y+b.H > maxY { + maxY = b.Y + b.H + } + } + + line := hocr.Line{ + ID: fmt.Sprintf("line_%d_%d", pageNumber, lidx), + BBox: hocr.NewBoundingBox(minX, minY, maxX, maxY), + Metadata: make(map[string]string), + } + + for widx, b := range lineBoxes { + word := hocr.Word{ + ID: fmt.Sprintf("word_%d_%d_%d", pageNumber, lidx, widx), + Text: b.Text, + BBox: hocr.NewBoundingBox(b.X, b.Y, b.X+b.W, b.Y+b.H), + } + line.Words = append(line.Words, word) + } + + page.Lines = append(page.Lines, line) + } + + return page +} diff --git a/ocr/iosocr_provider_test.go b/ocr/iosocr_provider_test.go index fbc7cfc6..5e4ea22b 100644 --- a/ocr/iosocr_provider_test.go +++ b/ocr/iosocr_provider_test.go @@ -8,6 +8,7 @@ import ( "net/http/httptest" "testing" + "github.com/gardar/ocrchestra/pkg/hocr" "github.com/stretchr/testify/assert" ) @@ -23,6 +24,15 @@ func newTestIosOcrProvider(serverURL string) *IosOcrProvider { } } +func newTestIosOcrProviderWithHOCR(serverURL string) *IosOcrProvider { + return &IosOcrProvider{ + serverURL: serverURL, + httpClient: &http.Client{}, + enableHOCR: true, + hocrPages: make([]hocr.Page, 0), + } +} + func TestIosOcrProvider_ProcessImage(t *testing.T) { sampleImageContent := []byte("dummy image data") @@ -165,3 +175,143 @@ func TestIosOcrProvider_ProcessImage(t *testing.T) { }) } } + +func TestIosOcrProvider_ProcessImage_HOCR(t *testing.T) { + sampleImageContent := []byte("dummy image data") + + tests := []struct { + name string + enableHOCR bool + ocrBoxes interface{} + expectHOCRPage bool + expectedLines int + }{ + { + name: "HOCR Enabled With Boxes", + enableHOCR: true, + ocrBoxes: []IosOcrBox{ + {Text: "Hello", X: 100, Y: 100, W: 200, H: 40}, + {Text: "World", X: 320, Y: 100, W: 180, H: 40}, + {Text: "Second", X: 100, Y: 200, W: 220, H: 40}, + }, + expectHOCRPage: true, + expectedLines: 2, + }, + { + name: "HOCR Enabled Empty Boxes", + enableHOCR: 
true, + ocrBoxes: []IosOcrBox{}, + expectHOCRPage: false, + expectedLines: 0, + }, + { + name: "HOCR Enabled Nil Boxes", + enableHOCR: true, + ocrBoxes: nil, + expectHOCRPage: false, + expectedLines: 0, + }, + { + name: "HOCR Disabled With Boxes", + enableHOCR: false, + ocrBoxes: []IosOcrBox{ + {Text: "Hello", X: 100, Y: 100, W: 200, H: 40}, + }, + expectHOCRPage: false, + expectedLines: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + resp := IosOcrUploadResponse{ + Success: true, + Message: "OK", + OcrResult: "Hello World\nSecond", + ImageWidth: 1247, + ImageHeight: 648, + OcrBoxes: tt.ocrBoxes, + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(resp) + })) + defer server.Close() + + var provider *IosOcrProvider + if tt.enableHOCR { + provider = newTestIosOcrProviderWithHOCR(server.URL) + } else { + provider = newTestIosOcrProvider(server.URL) + } + + result, err := provider.ProcessImage(context.Background(), sampleImageContent, 1) + assert.NoError(t, err) + assert.NotNil(t, result) + + if tt.expectHOCRPage { + assert.NotNil(t, result.HOCRPage) + assert.Equal(t, 1, result.HOCRPage.PageNumber) + assert.Len(t, result.HOCRPage.Lines, tt.expectedLines) + if len(result.HOCRPage.Lines) > 0 { + assert.Len(t, result.HOCRPage.Lines[0].Words, 2) + assert.Equal(t, "Hello", result.HOCRPage.Lines[0].Words[0].Text) + assert.Equal(t, "World", result.HOCRPage.Lines[0].Words[1].Text) + } + } else { + assert.Nil(t, result.HOCRPage) + } + }) + } +} + +func TestBuildHOCRPage(t *testing.T) { + boxes := []IosOcrBox{ + {Text: "Hello", X: 100, Y: 100, W: 200, H: 40}, + {Text: "World", X: 320, Y: 100, W: 180, H: 40}, + {Text: "Bar", X: 50, Y: 200, W: 100, H: 40}, + } + + page := buildHOCRPage(boxes, "Hello World\nBar", 1, 1247, 648) + + assert.Equal(t, "page_1", page.ID) + assert.Equal(t, 1, page.PageNumber) + assert.Equal(t, 
hocr.BoundingBox{X1: 0, Y1: 0, X2: 1247, Y2: 648}, page.BBox) + assert.Len(t, page.Lines, 2) + + // First line: "Hello World" + assert.Len(t, page.Lines[0].Words, 2) + assert.Equal(t, "Hello", page.Lines[0].Words[0].Text) + assert.Equal(t, "World", page.Lines[0].Words[1].Text) + assert.Equal(t, hocr.BoundingBox{X1: 100, Y1: 100, X2: 500, Y2: 140}, page.Lines[0].BBox) + + // Second line: "Bar" + assert.Len(t, page.Lines[1].Words, 1) + assert.Equal(t, "Bar", page.Lines[1].Words[0].Text) +} + +func TestParseOcrBoxes(t *testing.T) { + raw := []interface{}{ + map[string]interface{}{ + "text": "Hello", + "x": float64(100), + "y": float64(100), + "w": float64(200), + "h": float64(40), + }, + } + + boxes, err := parseOcrBoxes(raw) + assert.NoError(t, err) + assert.Len(t, boxes, 1) + assert.Equal(t, "Hello", boxes[0].Text) + assert.Equal(t, 100.0, boxes[0].X) + + // Test with nil + _, err = parseOcrBoxes(nil) + assert.Error(t, err) + + // Test with invalid data + _, err = parseOcrBoxes("not an array") + assert.Error(t, err) +} From b89f818a15f57e89a47dbde4794f97e851840da9 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 18:30:18 +0200 Subject: [PATCH 13/26] fix: resolve variable shadowing in ocr.go image mode branch The := at line 239 created a local imagePaths variable that shadowed the outer declaration (line 137), causing processed_pages=0 at line 366. This prevented PDF generation in image mode for all hOCR-capable providers. Changed to pre-declare imgPageCount and err, then use = for assignment, so the outer imagePaths is used consistently. This is a behavior change: PDF generation in image mode now works when it was previously silently broken. 
--- ocr.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocr.go b/ocr.go index bff85a3d..3fae16c3 100644 --- a/ocr.go +++ b/ocr.go @@ -236,7 +236,9 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int, options } } else { // Process pages as images - imagePaths, imgPageCount, err := app.Client.DownloadDocumentAsImages(ctx, documentID, pageLimit) + var imgPageCount int + var err error + imagePaths, imgPageCount, err = app.Client.DownloadDocumentAsImages(ctx, documentID, pageLimit) defer func() { for _, imagePath := range imagePaths { if err := os.Remove(imagePath); err != nil { From 65102b7d5410968b7bfd7596cae2e1fa0a2517a2 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sat, 16 May 2026 18:57:07 +0200 Subject: [PATCH 14/26] docs: update Enhanced OCR Features section for iOS OCR Server hOCR support --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ecc68aad..d9f80958 100644 --- a/README.md +++ b/README.md @@ -394,6 +394,7 @@ Uses the [OCR Server](https://apps.apple.com/ch/app/ocr-server/id6749533041) iOS - 100% local processing, no cloud dependencies, full privacy - Supports multiple languages with automatic detection - No API keys or external accounts needed + - **hOCR support for searchable PDF generation** (see [Enhanced OCR Features](#enhanced-ocr-features)) - **Best For**: - Users with an iOS device on the same network - Privacy-sensitive environments @@ -459,14 +460,14 @@ environment: paperless-gpt includes powerful OCR enhancements that go beyond basic text extraction: -> **Important Note**: The PDF text layer generation and hOCR features are currently **only supported with Google Document AI** as the OCR provider. These features are not available when using LLM-based OCR or Azure Document Intelligence. 
+> **Important Note**: The PDF text layer generation and hOCR features are currently only supported with **Google Document AI** and **iOS OCR Server** as OCR providers. These features are not available when using LLM-based OCR, Azure Document Intelligence, Mistral OCR, or Docling Server. ### PDF Text Layer Generation - **Searchable & Selectable PDFs**: Creates PDFs with transparent text overlays accurately positioned over each word in the document - **hOCR Integration**: Utilizes hOCR format (HTML-based OCR representation) to maintain precise text positioning - **Document Quality Improvement**: Makes documents both searchable and selectable while preserving the original appearance -- **Google Document AI Required**: These features rely on Google Document AI's ability to generate hOCR data with accurate word positions +- **Google Document AI or iOS OCR Server Required**: These features rely on the provider's ability to generate hOCR data with accurate word positions ### Local File Saving From 04dca66232f4cb084d7ed1c157f977f805961942 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sun, 17 May 2026 11:25:21 +0200 Subject: [PATCH 15/26] fix: address code review feedback - brittle test string, nil guard in parseOcrBoxes, README contradictions --- README.md | 4 ++-- ocr/iosocr_provider.go | 4 ++++ ocr/iosocr_provider_test.go | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index d9f80958..364badb8 100644 --- a/README.md +++ b/README.md @@ -346,8 +346,8 @@ paperless-gpt supports four different OCR providers, each with unique strengths - Strong form field detection - Multi-language support - High accuracy on structured documents - - **Exclusive hOCR generation** for creating searchable PDFs with text layers - - **Only provider that supports** enhanced PDF generation features + - **hOCR generation** for creating searchable PDFs with text layers + - **One of the providers that support** enhanced PDF generation features - **Best For**: 
- Forms and structured documents - Documents with tables diff --git a/ocr/iosocr_provider.go b/ocr/iosocr_provider.go index e6006aab..2e952785 100644 --- a/ocr/iosocr_provider.go +++ b/ocr/iosocr_provider.go @@ -264,6 +264,10 @@ func (p *IosOcrProvider) ResetHOCR() { // parseOcrBoxes attempts to parse the raw ocr_boxes field into a typed slice func parseOcrBoxes(raw interface{}) ([]IosOcrBox, error) { + if raw == nil { + return nil, fmt.Errorf("ocr_boxes is null") + } + jsonData, err := json.Marshal(raw) if err != nil { return nil, fmt.Errorf("failed to marshal ocr_boxes: %w", err) diff --git a/ocr/iosocr_provider_test.go b/ocr/iosocr_provider_test.go index 5e4ea22b..8b91e478 100644 --- a/ocr/iosocr_provider_test.go +++ b/ocr/iosocr_provider_test.go @@ -136,7 +136,7 @@ func TestIosOcrProvider_ProcessImage(t *testing.T) { name: "Server Connection Error", mockHandler: func(w http.ResponseWriter, r *http.Request) { }, - expectedErrStr: "connection refused", + expectedErrStr: "error sending request to iOS OCR Server", }, } From 8248d97b85085e6eec14386514acb225da96347c Mon Sep 17 00:00:00 2001 From: vistalba Date: Sun, 17 May 2026 18:57:25 +0200 Subject: [PATCH 16/26] fix: add PatchDocument to ClientInterface and mock implementations --- app_llm_test.go | 1 + background_test.go | 4 ++++ types.go | 1 + 3 files changed, 6 insertions(+) diff --git a/app_llm_test.go b/app_llm_test.go index 1f860a1b..2b82846b 100644 --- a/app_llm_test.go +++ b/app_llm_test.go @@ -479,6 +479,7 @@ func (m *mockPaperlessClient) GetTaskStatus(ctx context.Context, taskID string) return nil, nil } func (m *mockPaperlessClient) DeleteDocument(ctx context.Context, documentID int) error { return nil } +func (m *mockPaperlessClient) PatchDocument(ctx context.Context, documentID int, fields map[string]interface{}) error { return nil } func TestGetSuggestedCustomFields(t *testing.T) { // 1. 
Setup diff --git a/background_test.go b/background_test.go index 47895001..faffdfa5 100644 --- a/background_test.go +++ b/background_test.go @@ -81,6 +81,10 @@ func (m *mockClient) GetDocument(ctx context.Context, documentID int) (Document, return Document{}, fmt.Errorf("document %d not found", documentID) } +func (m *mockClient) PatchDocument(ctx context.Context, documentID int, fields map[string]interface{}) error { + return nil +} + func (m *mockClient) AddDocument(doc Document, tags []string) { m.documents[doc.ID] = doc diff --git a/types.go b/types.go index ef838606..55314372 100644 --- a/types.go +++ b/types.go @@ -189,6 +189,7 @@ type ClientInterface interface { UploadDocument(ctx context.Context, data []byte, filename string, metadata map[string]interface{}) (string, error) GetTaskStatus(ctx context.Context, taskID string) (map[string]interface{}, error) DeleteDocument(ctx context.Context, documentID int) error + PatchDocument(ctx context.Context, documentID int, fields map[string]interface{}) error } // DocumentProcessor defines the interface for processing documents with OCR From 2b57f0a7bcd4501dcae7a5731100a5537ba6d145 Mon Sep 17 00:00:00 2001 From: vistalba Date: Sun, 17 May 2026 19:44:28 +0200 Subject: [PATCH 17/26] fix: decode GetTaskStatus response as array, extract document ID from related_document --- ocr.go | 45 +++++++++++++++++++++++++++++---------------- ocr_test.go | 12 +++++++----- paperless.go | 17 +++++++++++++---- 3 files changed, 49 insertions(+), 25 deletions(-) diff --git a/ocr.go b/ocr.go index 3fae16c3..d6c80855 100644 --- a/ocr.go +++ b/ocr.go @@ -6,6 +6,7 @@ import ( "fmt" "os" "path/filepath" + "strconv" "strings" "time" @@ -594,23 +595,35 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData logger.Info("Document processing completed successfully") // Restore owner and permissions on the new document - if resultMap, ok := taskStatus["result"].(map[string]interface{}); ok { - if newDocIDFloat, ok := 
resultMap["document_id"].(float64); ok { - newDocID := int(newDocIDFloat) - logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") - - patchFields := make(map[string]interface{}) - if originalDoc.Owner != nil { - patchFields["owner"] = *originalDoc.Owner - } - if originalDoc.Permissions != nil { - patchFields["set_permissions"] = originalDoc.Permissions - } + var newDocID int + found := false + if relatedDocStr, ok := taskStatus["related_document"].(string); ok { + if id, err := strconv.Atoi(relatedDocStr); err == nil { + newDocID = id + found = true + } + } + if !found { + if idFloat, ok := taskStatus["id"].(float64); ok { + newDocID = int(idFloat) + found = true + } + } - if len(patchFields) > 0 { - if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { - logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") - } + if found { + logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") + + patchFields := make(map[string]interface{}) + if originalDoc.Owner != nil { + patchFields["owner"] = *originalDoc.Owner + } + if originalDoc.Permissions != nil { + patchFields["set_permissions"] = originalDoc.Permissions + } + + if len(patchFields) > 0 { + if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { + logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") } } } diff --git a/ocr_test.go b/ocr_test.go index 1d39a729..0062db34 100644 --- a/ocr_test.go +++ b/ocr_test.go @@ -197,11 +197,13 @@ func TestUploadProcessedPDF(t *testing.T) { require.Equal(t, mockTaskID, taskID, "Unexpected task ID in status request") w.WriteHeader(http.StatusOK) - json.NewEncoder(w).Encode(map[string]interface{}{ - "status": "SUCCESS", - "task_id": taskID, - "result": map[string]interface{}{ - "document_id": documentID, + json.NewEncoder(w).Encode([]map[string]interface{}{ + { + "id": documentID, + 
"status": "SUCCESS", + "task_id": taskID, + "related_document": fmt.Sprintf("%d", documentID), + "result": fmt.Sprintf("Success. New document id %d created", documentID), }, }) }) diff --git a/paperless.go b/paperless.go index 425860e1..5fd600e5 100644 --- a/paperless.go +++ b/paperless.go @@ -1342,12 +1342,21 @@ func (client *PaperlessClient) GetTaskStatus(ctx context.Context, taskID string) return nil, fmt.Errorf("error checking task status: %d, %s", resp.StatusCode, string(bodyBytes)) } - var result map[string]interface{} - if err := json.NewDecoder(resp.Body).Decode(&result); err != nil { - return nil, fmt.Errorf("error parsing response: %w", err) + var bodyBytes []byte + bodyBytes, err = io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("error reading task status response: %w", err) + } + + var tasks []map[string]interface{} + if err := json.Unmarshal(bodyBytes, &tasks); err != nil { + return nil, fmt.Errorf("error parsing task status response: %w", err) + } + if len(tasks) == 0 { + return nil, fmt.Errorf("empty task status response") } - return result, nil + return tasks[0], nil } // CreateTag creates a new tag and returns its ID From f6a38467b7ac595fcc2011501c0ae5d40d900466 Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 08:37:22 +0200 Subject: [PATCH 18/26] feat: add PDF_PRESERVE_OWNER_PERMISSIONS env var to preserve document owner and permissions on upload - New env var PDF_PRESERVE_OWNER_PERMISSIONS (bool, default false) - Poll task + PATCH owner/permissions independently of ReplaceOriginal - Deletion still gated by ReplaceOriginal only - Added validation: PreserveOwnerPermissions requires UploadPDF=true - Updated all OCROptions construction sites and tests --- background.go | 11 ++++--- jobs.go | 9 +++--- main.go | 3 ++ ocr.go | 89 ++++++++++++++++++++++++++++++--------------------- ocr_test.go | 54 +++++++++++++++++++++++++------ types.go | 7 ++-- 6 files changed, 114 insertions(+), 59 deletions(-) diff --git 
a/background.go b/background.go index 6b002d87..955b915c 100644 --- a/background.go +++ b/background.go @@ -211,11 +211,12 @@ func (app *App) processAutoOcrTagDocuments(ctx context.Context) (int, error) { } options := OCROptions{ - UploadPDF: app.pdfUpload, - ReplaceOriginal: app.pdfReplace, - CopyMetadata: app.pdfCopyMetadata, - LimitPages: limitOcrPages, - ProcessMode: app.ocrProcessMode, + UploadPDF: app.pdfUpload, + ReplaceOriginal: app.pdfReplace, + CopyMetadata: app.pdfCopyMetadata, + PreserveOwnerPermissions: app.pdfPreserveOwnerPermissions, + LimitPages: limitOcrPages, + ProcessMode: app.ocrProcessMode, } // Use the DocumentProcessor interface instead of calling the method directly diff --git a/jobs.go b/jobs.go index ed9df27b..13f9ed9b 100644 --- a/jobs.go +++ b/jobs.go @@ -153,10 +153,11 @@ func processJob(app *App, job *Job) { if (options == OCROptions{}) { // Use app defaults if job options are not set options = OCROptions{ - UploadPDF: app.pdfUpload, - ReplaceOriginal: app.pdfReplace, - CopyMetadata: app.pdfCopyMetadata, - LimitPages: limitOcrPages, + UploadPDF: app.pdfUpload, + ReplaceOriginal: app.pdfReplace, + CopyMetadata: app.pdfCopyMetadata, + PreserveOwnerPermissions: app.pdfPreserveOwnerPermissions, + LimitPages: limitOcrPages, } } diff --git a/main.go b/main.go index f73170c1..0de51d67 100644 --- a/main.go +++ b/main.go @@ -76,6 +76,7 @@ var ( pdfUpload = os.Getenv("PDF_UPLOAD") == "true" pdfReplace = os.Getenv("PDF_REPLACE") == "true" pdfCopyMetadata = os.Getenv("PDF_COPY_METADATA") == "true" + pdfPreserveOwnerPermissions = os.Getenv("PDF_PRESERVE_OWNER_PERMISSIONS") == "true" pdfOCRCompleteTag = os.Getenv("PDF_OCR_COMPLETE_TAG") pdfOCRTagging = os.Getenv("PDF_OCR_TAGGING") == "true" pdfSkipExistingOCR = os.Getenv("PDF_SKIP_EXISTING_OCR") == "true" @@ -135,6 +136,7 @@ type App struct { pdfUpload bool // Whether to upload processed PDFs to paperless-ngx pdfReplace bool // Whether to replace original document after upload pdfCopyMetadata bool 
// Whether to copy metadata from original to uploaded PDF + pdfPreserveOwnerPermissions bool // Whether to restore owner and permissions on the uploaded document pdfOCRCompleteTag string // Tag to add to documents that have been OCR processed pdfOCRTagging bool // Whether to add the OCR complete tag to processed PDFs pdfSkipExistingOCR bool // Whether to skip processing PDFs that already have OCR detected @@ -348,6 +350,7 @@ func main() { pdfUpload: pdfUpload, pdfReplace: pdfReplace, pdfCopyMetadata: pdfCopyMetadata, + pdfPreserveOwnerPermissions: pdfPreserveOwnerPermissions, pdfOCRCompleteTag: pdfOCRCompleteTag, pdfOCRTagging: pdfOCRTagging, pdfSkipExistingOCR: pdfSkipExistingOCR, diff --git a/ocr.go b/ocr.go index d6c80855..699c044a 100644 --- a/ocr.go +++ b/ocr.go @@ -48,6 +48,9 @@ func (app *App) ProcessDocumentOCR(ctx context.Context, documentID int, options if !options.UploadPDF && options.ReplaceOriginal { return nil, fmt.Errorf("invalid OCROptions: cannot set ReplaceOriginal=true when UploadPDF=false") } + if !options.UploadPDF && options.PreserveOwnerPermissions { + return nil, fmt.Errorf("invalid OCROptions: cannot set PreserveOwnerPermissions=true when UploadPDF=false") + } docLogger := documentLogger(documentID) if jobID != "" { @@ -570,68 +573,86 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData logger.WithField("task_id", taskID).Info("PDF uploaded successfully") - // If replacing the original is requested, delete it after upload - if options.ReplaceOriginal { + // Determine if we need to wait for document processing to complete + // (needed for both deletion and owner/permissions restoration) + needPoll := options.ReplaceOriginal || options.PreserveOwnerPermissions + + if needPoll { // Poll for task completion maxRetries := 12 waitTime := 5 * time.Second - logger.Info("Waiting for document processing to complete before deletion...") + logger.Info("Waiting for document processing to complete...") for i := 0; i < 
maxRetries; i++ { taskStatus, err := app.Client.GetTaskStatus(ctx, taskID) if err != nil { - logger.WithError(err).Warn("Failed to check task status, proceeding with deletion anyway") + logger.WithError(err).Warn("Failed to check task status") break } status, ok := taskStatus["status"].(string) if !ok { - logger.Warn("Could not determine task status, proceeding with deletion anyway") + logger.Warn("Could not determine task status") break } if status == "SUCCESS" { logger.Info("Document processing completed successfully") - // Restore owner and permissions on the new document - var newDocID int - found := false - if relatedDocStr, ok := taskStatus["related_document"].(string); ok { - if id, err := strconv.Atoi(relatedDocStr); err == nil { - newDocID = id - found = true + // Restore owner and permissions on the new document if requested + if options.PreserveOwnerPermissions || options.ReplaceOriginal { + var newDocID int + found := false + if relatedDocStr, ok := taskStatus["related_document"].(string); ok { + if id, err := strconv.Atoi(relatedDocStr); err == nil { + newDocID = id + found = true + } } - } - if !found { - if idFloat, ok := taskStatus["id"].(float64); ok { - newDocID = int(idFloat) - found = true + if !found { + if idFloat, ok := taskStatus["id"].(float64); ok { + newDocID = int(idFloat) + found = true + } } - } - if found { - logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") + if found { + logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") - patchFields := make(map[string]interface{}) - if originalDoc.Owner != nil { - patchFields["owner"] = *originalDoc.Owner - } - if originalDoc.Permissions != nil { - patchFields["set_permissions"] = originalDoc.Permissions - } + patchFields := make(map[string]interface{}) + if originalDoc.Owner != nil { + patchFields["owner"] = *originalDoc.Owner + } + if originalDoc.Permissions != nil { + patchFields["set_permissions"] = 
originalDoc.Permissions + } - if len(patchFields) > 0 { - if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { - logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") + if len(patchFields) > 0 { + if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { + logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") + } } } } + + // Delete original document if replacing + if options.ReplaceOriginal { + if err := app.Client.DeleteDocument(ctx, documentID); err != nil { + return fmt.Errorf("error deleting original document: %w", err) + } + logger.Info("Original document deleted successfully") + } + break } if status == "FAILURE" { - return fmt.Errorf("document processing failed, not deleting original document") + if options.ReplaceOriginal { + return fmt.Errorf("document processing failed, not deleting original document") + } + logger.Warn("Document processing failed, owner/permissions not restored") + break } if i < maxRetries-1 { @@ -639,12 +660,6 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData time.Sleep(waitTime) } } - - // Delete original document - if err := app.Client.DeleteDocument(ctx, documentID); err != nil { - return fmt.Errorf("error deleting original document: %w", err) - } - logger.Info("Original document deleted successfully") } return nil diff --git a/ocr_test.go b/ocr_test.go index 0062db34..898536ef 100644 --- a/ocr_test.go +++ b/ocr_test.go @@ -89,7 +89,7 @@ func TestProcessDocumentOCR_SafetyFeature(t *testing.T) { }, } - for _, tc := range testCases { + for _, tc := range tc { t.Run(tc.name, func(t *testing.T) { // Set global limitOcrPages limitOcrPages = tc.limitPages @@ -228,7 +228,7 @@ func TestUploadProcessedPDF(t *testing.T) { }) // Test cases - testCases := []struct { + tc := []struct { name string options OCROptions expectReplacement bool @@ -238,10 +238,11 @@ func TestUploadProcessedPDF(t 
*testing.T) { { name: "Upload with metadata copy, no replacement", options: OCROptions{ - UploadPDF: true, - ReplaceOriginal: false, - CopyMetadata: true, - LimitPages: 0, + UploadPDF: true, + ReplaceOriginal: false, + CopyMetadata: true, + PreserveOwnerPermissions: false, + LimitPages: 0, }, expectReplacement: false, expectTagging: true, @@ -250,15 +251,29 @@ func TestUploadProcessedPDF(t *testing.T) { { name: "Upload with replacement", options: OCROptions{ - UploadPDF: true, - ReplaceOriginal: true, - CopyMetadata: true, - LimitPages: 0, + UploadPDF: true, + ReplaceOriginal: true, + CopyMetadata: true, + PreserveOwnerPermissions: false, + LimitPages: 0, }, expectReplacement: true, expectTagging: true, expectMetadataCopy: true, }, + { + name: "Upload with owner permissions preservation, no replacement", + options: OCROptions{ + UploadPDF: true, + ReplaceOriginal: false, + CopyMetadata: false, + PreserveOwnerPermissions: true, + LimitPages: 0, + }, + expectReplacement: false, + expectTagging: false, + expectMetadataCopy: false, + }, } for _, tc := range testCases { @@ -293,6 +308,9 @@ func TestOCROptionsValidation(t *testing.T) { if !opts.UploadPDF && opts.ReplaceOriginal { return fmt.Errorf("invalid OCROptions: cannot set ReplaceOriginal=true when UploadPDF=false") } + if !opts.UploadPDF && opts.PreserveOwnerPermissions { + return fmt.Errorf("invalid OCROptions: cannot set PreserveOwnerPermissions=true when UploadPDF=false") + } return nil } @@ -333,6 +351,22 @@ func TestOCROptionsValidation(t *testing.T) { }, expectError: true, }, + { + name: "Safe: preserve permissions with upload", + options: OCROptions{ + UploadPDF: true, + PreserveOwnerPermissions: true, + }, + expectError: false, + }, + { + name: "Unsafe: preserve permissions without upload", + options: OCROptions{ + UploadPDF: false, + PreserveOwnerPermissions: true, + }, + expectError: true, + }, } for _, tc := range testCases { diff --git a/types.go b/types.go index 55314372..200ddc71 100644 --- 
a/types.go +++ b/types.go @@ -166,9 +166,10 @@ type Correspondent struct { // OCROptions contains options for the OCR processing type OCROptions struct { - UploadPDF bool // Whether to upload the generated PDF - ReplaceOriginal bool // Whether to delete the original document after uploading - CopyMetadata bool // Whether to copy metadata from the original document + UploadPDF bool // Whether to upload the generated PDF + ReplaceOriginal bool // Whether to delete the original document after uploading + CopyMetadata bool // Whether to copy metadata from the original document + PreserveOwnerPermissions bool // Whether to restore owner and permissions on the uploaded document LimitPages int // Limit on the number of pages to process (0 = no limit) ProcessMode string // OCR processing mode: "image" (default) or "pdf" } From b4eeee8327fa745a309c73872cbba33fd156eef0 Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 09:54:52 +0200 Subject: [PATCH 19/26] feat: async in-memory queue for pending permission restores - Replace synchronous 60s timeout polling with async background queue - New file permissions.go with enqueuePermissionRestore and processPendingPermissionRestores - uploadProcessedPDF enqueues restore task when PreserveOwnerPermissions=true - Background loop processes queue every ~10s, retries indefinitely with 24h expiry - ReplaceOriginal block unchanged (sync poll + PATCH + delete for crash-safety) - extractDocIDFromTask / buildPatchFields / patchNewDocumentPermissions helpers added - PR note: queue is in-memory, lost on pod restart during the upload-to-PATCH window --- background.go | 9 ++++ main.go | 3 ++ ocr.go | 121 +++++++++++++++++++++++++++---------------------- ocr_test.go | 2 +- permissions.go | 99 ++++++++++++++++++++++++++++++++++++++++ types.go | 10 ++++ 6 files changed, 188 insertions(+), 56 deletions(-) create mode 100644 permissions.go diff --git a/background.go b/background.go index 955b915c..585c94db 100644 --- a/background.go +++ 
b/background.go @@ -45,6 +45,15 @@ func StartBackgroundTasks(ctx context.Context, app BackgroundProcessor) { count += ocrCount } + // Process pending permission restores + if a, ok := app.(*App); ok { + permCount, err := a.processPendingPermissionRestores(ctx) + if err != nil { + return 0, fmt.Errorf("error in processPendingPermissionRestores: %w", err) + } + count += permCount + } + // Run auto-tagging after OCR autoCount, err := app.processAutoTagDocuments(ctx) if err != nil { diff --git a/main.go b/main.go index 0de51d67..bda80380 100644 --- a/main.go +++ b/main.go @@ -140,6 +140,9 @@ type App struct { pdfOCRCompleteTag string // Tag to add to documents that have been OCR processed pdfOCRTagging bool // Whether to add the OCR complete tag to processed PDFs pdfSkipExistingOCR bool // Whether to skip processing PDFs that already have OCR detected + + pendingRestores []PendingPermissionRestore + pendingRestoresMu sync.Mutex } func main() { diff --git a/ocr.go b/ocr.go index 699c044a..66ada93c 100644 --- a/ocr.go +++ b/ocr.go @@ -573,86 +573,45 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData logger.WithField("task_id", taskID).Info("PDF uploaded successfully") - // Determine if we need to wait for document processing to complete - // (needed for both deletion and owner/permissions restoration) - needPoll := options.ReplaceOriginal || options.PreserveOwnerPermissions + // Enqueue async permission restore if requested + // The background loop will poll the task and PATCH when complete + if options.PreserveOwnerPermissions { + app.enqueuePermissionRestore(taskID, documentID, originalDoc.Owner, originalDoc.Permissions) + logger.Info("Queued permission restore for new document") + } - if needPoll { + // If replacing the original is requested, poll for completion and delete + if options.ReplaceOriginal { // Poll for task completion maxRetries := 12 waitTime := 5 * time.Second - logger.Info("Waiting for document processing to 
complete...") + logger.Info("Waiting for document processing to complete before deletion...") for i := 0; i < maxRetries; i++ { taskStatus, err := app.Client.GetTaskStatus(ctx, taskID) if err != nil { - logger.WithError(err).Warn("Failed to check task status") + logger.WithError(err).Warn("Failed to check task status, proceeding with deletion anyway") break } status, ok := taskStatus["status"].(string) if !ok { - logger.Warn("Could not determine task status") + logger.Warn("Could not determine task status, proceeding with deletion anyway") break } if status == "SUCCESS" { logger.Info("Document processing completed successfully") - // Restore owner and permissions on the new document if requested - if options.PreserveOwnerPermissions || options.ReplaceOriginal { - var newDocID int - found := false - if relatedDocStr, ok := taskStatus["related_document"].(string); ok { - if id, err := strconv.Atoi(relatedDocStr); err == nil { - newDocID = id - found = true - } - } - if !found { - if idFloat, ok := taskStatus["id"].(float64); ok { - newDocID = int(idFloat) - found = true - } - } - - if found { - logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") - - patchFields := make(map[string]interface{}) - if originalDoc.Owner != nil { - patchFields["owner"] = *originalDoc.Owner - } - if originalDoc.Permissions != nil { - patchFields["set_permissions"] = originalDoc.Permissions - } - - if len(patchFields) > 0 { - if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { - logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") - } - } - } - } - - // Delete original document if replacing - if options.ReplaceOriginal { - if err := app.Client.DeleteDocument(ctx, documentID); err != nil { - return fmt.Errorf("error deleting original document: %w", err) - } - logger.Info("Original document deleted successfully") - } + // Restore owner and permissions on the new document + 
app.patchNewDocumentPermissions(ctx, taskStatus, originalDoc.Owner, originalDoc.Permissions, logger) break } if status == "FAILURE" { - if options.ReplaceOriginal { - return fmt.Errorf("document processing failed, not deleting original document") - } - logger.Warn("Document processing failed, owner/permissions not restored") - break + return fmt.Errorf("document processing failed, not deleting original document") } if i < maxRetries-1 { @@ -660,7 +619,59 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData time.Sleep(waitTime) } } + + // Delete original document (even if poll timed out — upload was successful) + if err := app.Client.DeleteDocument(ctx, documentID); err != nil { + return fmt.Errorf("error deleting original document: %w", err) + } + logger.Info("Original document deleted successfully") } return nil } + +// extractDocIDFromTask extracts the new document ID from a task status response. +func extractDocIDFromTask(taskStatus map[string]interface{}) (int, bool) { + if relatedDocStr, ok := taskStatus["related_document"].(string); ok { + if id, err := strconv.Atoi(relatedDocStr); err == nil { + return id, true + } + } + if idFloat, ok := taskStatus["id"].(float64); ok { + return int(idFloat), true + } + return 0, false +} + +// buildPatchFields creates the PATCH payload for owner and permissions. +func buildPatchFields(owner *int, permissions *PermissionSet) map[string]interface{} { + fields := make(map[string]interface{}) + if owner != nil { + fields["owner"] = *owner + } + if permissions != nil { + fields["set_permissions"] = permissions + } + return fields +} + +// patchNewDocumentPermissions extracts the new document ID from a task status +// and patches it with the given owner and permissions. 
+func (app *App) patchNewDocumentPermissions(ctx context.Context, taskStatus map[string]interface{}, owner *int, permissions *PermissionSet, logger *logrus.Entry) { + newDocID, found := extractDocIDFromTask(taskStatus) + if !found { + logger.Warn("Could not determine new document ID from task status") + return + } + + logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") + + patchFields := buildPatchFields(owner, permissions) + if len(patchFields) == 0 { + return + } + + if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { + logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") + } +} diff --git a/ocr_test.go b/ocr_test.go index 898536ef..43f070a4 100644 --- a/ocr_test.go +++ b/ocr_test.go @@ -276,7 +276,7 @@ func TestUploadProcessedPDF(t *testing.T) { }, } - for _, tc := range testCases { + for _, tc := range tc { t.Run(tc.name, func(t *testing.T) { // Reset tracking variables deleteDocCalled = false diff --git a/permissions.go b/permissions.go new file mode 100644 index 00000000..ccf49ba7 --- /dev/null +++ b/permissions.go @@ -0,0 +1,99 @@ +package main + +import ( + "context" + "time" + + "github.com/sirupsen/logrus" +) + +// enqueuePermissionRestore adds a pending permission restore request to the queue. +func (app *App) enqueuePermissionRestore(taskID string, originalDocID int, owner *int, permissions *PermissionSet) { + app.pendingRestoresMu.Lock() + defer app.pendingRestoresMu.Unlock() + + app.pendingRestores = append(app.pendingRestores, PendingPermissionRestore{ + TaskID: taskID, + OriginalDocID: originalDocID, + Owner: owner, + Permissions: permissions, + CreatedAt: time.Now(), + }) +} + +// processPendingPermissionRestores processes the queue of pending permission restores. +// It is called periodically from the background loop. 
+func (app *App) processPendingPermissionRestores(ctx context.Context) (int, error) { + app.pendingRestoresMu.Lock() + queue := app.pendingRestores + app.pendingRestores = nil + app.pendingRestoresMu.Unlock() + + if len(queue) == 0 { + return 0, nil + } + + processed := 0 + var remaining []PendingPermissionRestore + + for _, entry := range queue { + // Expire entries older than 24 hours + if time.Since(entry.CreatedAt) > 24*time.Hour { + logrus.WithFields(logrus.Fields{ + "task_id": entry.TaskID, + "original_doc": entry.OriginalDocID, + }).Warn("Permission restore request expired after 24h, dropping") + continue + } + + taskStatus, err := app.Client.GetTaskStatus(ctx, entry.TaskID) + if err != nil { + logrus.WithFields(logrus.Fields{ + "task_id": entry.TaskID, + "original_doc": entry.OriginalDocID, + "error": err, + }).Warn("Failed to check task status for permission restore, will retry") + remaining = append(remaining, entry) + continue + } + + status, ok := taskStatus["status"].(string) + if !ok { + logrus.WithFields(logrus.Fields{ + "task_id": entry.TaskID, + "original_doc": entry.OriginalDocID, + }).Warn("Could not determine task status for permission restore, will retry") + remaining = append(remaining, entry) + continue + } + + switch status { + case "SUCCESS": + logger := logrus.WithFields(logrus.Fields{ + "task_id": entry.TaskID, + "original_doc": entry.OriginalDocID, + }) + app.patchNewDocumentPermissions(ctx, taskStatus, entry.Owner, entry.Permissions, logger) + processed++ + + case "FAILURE": + logrus.WithFields(logrus.Fields{ + "task_id": entry.TaskID, + "original_doc": entry.OriginalDocID, + }).Warn("Document processing failed, permission restore not possible") + + default: + // PENDING or STARTED — retry next cycle + remaining = append(remaining, entry) + } + } + + // Put back remaining entries for next cycle + if len(remaining) > 0 { + app.pendingRestoresMu.Lock() + app.pendingRestores = append(app.pendingRestores, remaining...) 
+ app.pendingRestoresMu.Unlock() + } + + return processed, nil +} diff --git a/types.go b/types.go index 200ddc71..cd4fdfa0 100644 --- a/types.go +++ b/types.go @@ -164,6 +164,16 @@ type Correspondent struct { } `json:"set_permissions"` } +// PendingPermissionRestore represents a queued request to restore owner and permissions +// on a newly uploaded document after its consumption task completes. +type PendingPermissionRestore struct { + TaskID string + OriginalDocID int + Owner *int + Permissions *PermissionSet + CreatedAt time.Time +} + // OCROptions contains options for the OCR processing type OCROptions struct { UploadPDF bool // Whether to upload the generated PDF From a0820a8536a026467a3dfa0df5b4990337ecc98b Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 10:28:49 +0200 Subject: [PATCH 20/26] fix: use correct Permissions type (not PermissionSet) in queue struct and helpers - PendingPermissionRestore.Permissions was *PermissionSet, should be *Permissions - Added missing 'time' import to types.go - Fixed enqueuePermissionRestore, buildPatchFields, patchNewDocumentPermissions signatures --- ocr.go | 4 ++-- permissions.go | 2 +- types.go | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ocr.go b/ocr.go index 66ada93c..43fc88ba 100644 --- a/ocr.go +++ b/ocr.go @@ -644,7 +644,7 @@ func extractDocIDFromTask(taskStatus map[string]interface{}) (int, bool) { } // buildPatchFields creates the PATCH payload for owner and permissions. -func buildPatchFields(owner *int, permissions *PermissionSet) map[string]interface{} { +func buildPatchFields(owner *int, permissions *Permissions) map[string]interface{} { fields := make(map[string]interface{}) if owner != nil { fields["owner"] = *owner @@ -657,7 +657,7 @@ func buildPatchFields(owner *int, permissions *PermissionSet) map[string]interfa // patchNewDocumentPermissions extracts the new document ID from a task status // and patches it with the given owner and permissions. 
-func (app *App) patchNewDocumentPermissions(ctx context.Context, taskStatus map[string]interface{}, owner *int, permissions *PermissionSet, logger *logrus.Entry) { +func (app *App) patchNewDocumentPermissions(ctx context.Context, taskStatus map[string]interface{}, owner *int, permissions *Permissions, logger *logrus.Entry) { newDocID, found := extractDocIDFromTask(taskStatus) if !found { logger.Warn("Could not determine new document ID from task status") diff --git a/permissions.go b/permissions.go index ccf49ba7..c90c4dbb 100644 --- a/permissions.go +++ b/permissions.go @@ -8,7 +8,7 @@ import ( ) // enqueuePermissionRestore adds a pending permission restore request to the queue. -func (app *App) enqueuePermissionRestore(taskID string, originalDocID int, owner *int, permissions *PermissionSet) { +func (app *App) enqueuePermissionRestore(taskID string, originalDocID int, owner *int, permissions *Permissions) { app.pendingRestoresMu.Lock() defer app.pendingRestoresMu.Unlock() diff --git a/types.go b/types.go index cd4fdfa0..b4ebdc82 100644 --- a/types.go +++ b/types.go @@ -2,6 +2,7 @@ package main import ( "context" + "time" "gorm.io/gorm" ) @@ -170,7 +171,7 @@ type PendingPermissionRestore struct { TaskID string OriginalDocID int Owner *int - Permissions *PermissionSet + Permissions *Permissions CreatedAt time.Time } From 8d1f6650f337636e9aa4afe7331a2ca1a480e0e8 Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 12:28:50 +0200 Subject: [PATCH 21/26] docs: add PDF_PRESERVE_OWNER_PERMISSIONS to README with trade-off note - New env var table entry - Trade-off warning in Metadata Copying Limitations section - Added success log in processPendingPermissionRestores --- README.md | 6 ++++++ permissions.go | 1 + 2 files changed, 7 insertions(+) diff --git a/README.md b/README.md index 364badb8..bf8dfc44 100644 --- a/README.md +++ b/README.md @@ -526,6 +526,11 @@ However, some metadata **cannot** be copied due to paperless-ngx API limitations - Custom fields that 
might be added by other paperless-ngx plugins - Notes and annotations +> **⚠️ Trade-off: Owner & Permissions Restoration** +> When `PDF_PRESERVE_OWNER_PERMISSIONS` is enabled, paperless-gpt uses an **in-memory queue** to restore the original document's owner and permissions after paperless-ngx finishes processing the upload. +> **If paperless-gpt crashes or restarts** between the upload and the restore, the queue is lost. The new document will retain paperless-ngx's default permissions, which may be less restrictive than the original. +> This is acceptable because the API token holder already has access to all documents, and a future update may add persistent queue storage. + ### Safety Features To prevent accidental creation of incomplete documents, paperless-gpt includes several safety features: @@ -615,6 +620,7 @@ For best results with the enhanced OCR features: | `PDF_UPLOAD` | Whether to upload enhanced PDFs to paperless-ngx. | No | false | | `PDF_REPLACE` | Whether to delete the original document after uploading the enhanced version (DANGEROUS). | No | false | | `PDF_COPY_METADATA` | Whether to copy metadata from the original document to the uploaded PDF. Only applicable when using PDF_UPLOAD. | No | true | +| `PDF_PRESERVE_OWNER_PERMISSIONS` | Whether to restore the original document's owner and permissions on the uploaded PDF. Uses an async background queue that retries until paperless-ngx consumption completes. In-memory; lost on pod restart during the upload-to-restore window. | No | false | | `PDF_OCR_TAGGING` | Whether to add a tag to mark documents as OCR-processed. | No | true | | `PDF_OCR_COMPLETE_TAG` | Tag used to mark documents as OCR-processed. | No | paperless-gpt-ocr-complete | | `PDF_SKIP_EXISTING_OCR` | Whether to skip OCR processing for PDFs that already have OCR. Works with `pdf` and `whole_pdf` processing modes (`OCR_PROCESS_MODE`). 
| No | false | diff --git a/permissions.go b/permissions.go index c90c4dbb..b8183ddb 100644 --- a/permissions.go +++ b/permissions.go @@ -74,6 +74,7 @@ func (app *App) processPendingPermissionRestores(ctx context.Context) (int, erro "original_doc": entry.OriginalDocID, }) app.patchNewDocumentPermissions(ctx, taskStatus, entry.Owner, entry.Permissions, logger) + logger.Info("Permission restore completed successfully") processed++ case "FAILURE": From 25e56892c9db9cc4dab2c9d79ec93845f067a2db Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 12:55:54 +0200 Subject: [PATCH 22/26] style: gofmt --- main.go | 72 ++++++++++++++++++++++++++--------------------------- ocr_test.go | 34 ++++++++++++------------- types.go | 12 ++++----- 3 files changed, 59 insertions(+), 59 deletions(-) diff --git a/main.go b/main.go index bda80380..6712cdff 100644 --- a/main.go +++ b/main.go @@ -122,24 +122,24 @@ func refreshCustomFieldsCache(client ClientInterface) { // App struct to hold dependencies type App struct { - Client ClientInterface - Database *gorm.DB - LLM llms.Model - VisionLLM llms.Model - ocrProvider ocr.Provider // OCR provider interface - ocrProcessMode string // OCR processing mode: "image" (default), "pdf" or "whole_pdf" - docProcessor DocumentProcessor // Optional: Can be used for mocking - localHOCRPath string // Path for saving hOCR files locally - localPDFPath string // Path for saving PDF files locally - createLocalHOCR bool // Whether to save hOCR files locally - createLocalPDF bool // Whether to create PDF files locally - pdfUpload bool // Whether to upload processed PDFs to paperless-ngx - pdfReplace bool // Whether to replace original document after upload - pdfCopyMetadata bool // Whether to copy metadata from original to uploaded PDF - pdfPreserveOwnerPermissions bool // Whether to restore owner and permissions on the uploaded document - pdfOCRCompleteTag string // Tag to add to documents that have been OCR processed - pdfOCRTagging bool // Whether 
to add the OCR complete tag to processed PDFs - pdfSkipExistingOCR bool // Whether to skip processing PDFs that already have OCR detected + Client ClientInterface + Database *gorm.DB + LLM llms.Model + VisionLLM llms.Model + ocrProvider ocr.Provider // OCR provider interface + ocrProcessMode string // OCR processing mode: "image" (default), "pdf" or "whole_pdf" + docProcessor DocumentProcessor // Optional: Can be used for mocking + localHOCRPath string // Path for saving hOCR files locally + localPDFPath string // Path for saving PDF files locally + createLocalHOCR bool // Whether to save hOCR files locally + createLocalPDF bool // Whether to create PDF files locally + pdfUpload bool // Whether to upload processed PDFs to paperless-ngx + pdfReplace bool // Whether to replace original document after upload + pdfCopyMetadata bool // Whether to copy metadata from original to uploaded PDF + pdfPreserveOwnerPermissions bool // Whether to restore owner and permissions on the uploaded document + pdfOCRCompleteTag string // Tag to add to documents that have been OCR processed + pdfOCRTagging bool // Whether to add the OCR complete tag to processed PDFs + pdfSkipExistingOCR bool // Whether to skip processing PDFs that already have OCR detected pendingRestores []PendingPermissionRestore pendingRestoresMu sync.Mutex @@ -288,7 +288,7 @@ func main() { DoclingImageExportMode: doclingImageExportMode, DoclingOCRPipeline: doclingOCRPipeline, DoclingOCREngine: doclingOCREngine, - IosOcrServerURL: iosOcrServerURL, + IosOcrServerURL: iosOcrServerURL, EnableHOCR: true, // Always generate hOCR struct if provider supports it VisionLLMMaxTokens: visionLlmMaxTokens, VisionLLMTemperature: visionLlmTemperature, @@ -339,24 +339,24 @@ func main() { // Initialize App with dependencies app := &App{ - Client: client, - Database: database, - LLM: llm, - VisionLLM: visionLlm, - ocrProvider: ocrProvider, - ocrProcessMode: ocrProcessMode, - docProcessor: nil, // App itself implements 
DocumentProcessor - localHOCRPath: localHOCRPath, - localPDFPath: localPDFPath, - createLocalHOCR: createLocalHOCR, - createLocalPDF: createLocalPDF, - pdfUpload: pdfUpload, - pdfReplace: pdfReplace, - pdfCopyMetadata: pdfCopyMetadata, + Client: client, + Database: database, + LLM: llm, + VisionLLM: visionLlm, + ocrProvider: ocrProvider, + ocrProcessMode: ocrProcessMode, + docProcessor: nil, // App itself implements DocumentProcessor + localHOCRPath: localHOCRPath, + localPDFPath: localPDFPath, + createLocalHOCR: createLocalHOCR, + createLocalPDF: createLocalPDF, + pdfUpload: pdfUpload, + pdfReplace: pdfReplace, + pdfCopyMetadata: pdfCopyMetadata, pdfPreserveOwnerPermissions: pdfPreserveOwnerPermissions, - pdfOCRCompleteTag: pdfOCRCompleteTag, - pdfOCRTagging: pdfOCRTagging, - pdfSkipExistingOCR: pdfSkipExistingOCR, + pdfOCRCompleteTag: pdfOCRCompleteTag, + pdfOCRTagging: pdfOCRTagging, + pdfSkipExistingOCR: pdfSkipExistingOCR, } if app.isOcrEnabled() { diff --git a/ocr_test.go b/ocr_test.go index 43f070a4..1bd16874 100644 --- a/ocr_test.go +++ b/ocr_test.go @@ -89,7 +89,7 @@ func TestProcessDocumentOCR_SafetyFeature(t *testing.T) { }, } - for _, tc := range tc { + for _, tc := range tc { t.Run(tc.name, func(t *testing.T) { // Set global limitOcrPages limitOcrPages = tc.limitPages @@ -199,11 +199,11 @@ func TestUploadProcessedPDF(t *testing.T) { w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode([]map[string]interface{}{ { - "id": documentID, - "status": "SUCCESS", - "task_id": taskID, - "related_document": fmt.Sprintf("%d", documentID), - "result": fmt.Sprintf("Success. New document id %d created", documentID), + "id": documentID, + "status": "SUCCESS", + "task_id": taskID, + "related_document": fmt.Sprintf("%d", documentID), + "result": fmt.Sprintf("Success. 
New document id %d created", documentID), }, }) }) @@ -432,8 +432,8 @@ func TestOCRDetectionBehavior(t *testing.T) { t.Run(tc.name, func(t *testing.T) { // Create a test environment with controlled PDF processing mockApp := &App{ - ocrProcessMode: tc.ocrMode, - pdfSkipExistingOCR: tc.pdfSkipExistingOCR, + ocrProcessMode: tc.ocrMode, + pdfSkipExistingOCR: tc.pdfSkipExistingOCR, } // Mock the pdfocr.DetectOCR function using monkey patching or a test stub @@ -442,21 +442,21 @@ func TestOCRDetectionBehavior(t *testing.T) { // Override the relevant conditional check to track if OCR detection would be performed // This is a simplified way to test the behavior without actually processing PDFs shouldCheck := false - + if mockApp.pdfSkipExistingOCR && (tc.ocrMode == "pdf" || tc.ocrMode == "whole_pdf") { - shouldCheck = true - ocrDetectionCalled = true + shouldCheck = true + ocrDetectionCalled = true } // Verify the OCR detection behavior - assert.Equal(t, tc.shouldCheckOCR, shouldCheck, - "OCR detection behavior doesn't match expected for mode=%s, skipExistingOCR=%v", - tc.ocrMode, tc.pdfSkipExistingOCR) - + assert.Equal(t, tc.shouldCheckOCR, shouldCheck, + "OCR detection behavior doesn't match expected for mode=%s, skipExistingOCR=%v", + tc.ocrMode, tc.pdfSkipExistingOCR) + if tc.shouldCheckOCR { - assert.True(t, ocrDetectionCalled, "OCR detection should be performed") + assert.True(t, ocrDetectionCalled, "OCR detection should be performed") } else { - assert.False(t, ocrDetectionCalled, "OCR detection should not be performed") + assert.False(t, ocrDetectionCalled, "OCR detection should not be performed") } }) } diff --git a/types.go b/types.go index b4ebdc82..db017977 100644 --- a/types.go +++ b/types.go @@ -177,12 +177,12 @@ type PendingPermissionRestore struct { // OCROptions contains options for the OCR processing type OCROptions struct { - UploadPDF bool // Whether to upload the generated PDF - ReplaceOriginal bool // Whether to delete the original document after 
uploading - CopyMetadata bool // Whether to copy metadata from the original document - PreserveOwnerPermissions bool // Whether to restore owner and permissions on the uploaded document - LimitPages int // Limit on the number of pages to process (0 = no limit) - ProcessMode string // OCR processing mode: "image" (default) or "pdf" + UploadPDF bool // Whether to upload the generated PDF + ReplaceOriginal bool // Whether to delete the original document after uploading + CopyMetadata bool // Whether to copy metadata from the original document + PreserveOwnerPermissions bool // Whether to restore owner and permissions on the uploaded document + LimitPages int // Limit on the number of pages to process (0 = no limit) + ProcessMode string // OCR processing mode: "image" (default) or "pdf" } // ClientInterface defines the interface for PaperlessClient operations From 1fa994d0c957132276a7d52a694c71c9b908b68c Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 13:15:33 +0200 Subject: [PATCH 23/26] fix: address CodeRabbit review findings - Rename tc to testCases in ocr_test.go for clarity - Remove taskStatus[id] fallback in extractDocIDFromTask - Make patchNewDocumentPermissions return error for retry logic - Sort hocrPages by PageNumber in GetHOCRDocument - Fix 'four different OCR providers' in README --- README.md | 2 +- ocr.go | 29 ++++++++++++++++++----------- ocr/iosocr_provider.go | 4 ++++ ocr_test.go | 6 +++--- permissions.go | 10 +++++++--- 5 files changed, 33 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index bf8dfc44..42530666 100644 --- a/README.md +++ b/README.md @@ -295,7 +295,7 @@ For detailed provider-specific documentation: - [Mistral AI Integration](docs/mistral_llm.md) -paperless-gpt supports four different OCR providers, each with unique strengths and capabilities: +paperless-gpt supports multiple OCR providers, each with unique strengths and capabilities: ### 1. 
LLM-based OCR (Default) diff --git a/ocr.go b/ocr.go index 43fc88ba..f5b02051 100644 --- a/ocr.go +++ b/ocr.go @@ -605,7 +605,8 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData logger.Info("Document processing completed successfully") // Restore owner and permissions on the new document - app.patchNewDocumentPermissions(ctx, taskStatus, originalDoc.Owner, originalDoc.Permissions, logger) + // Error is logged by patchNewDocumentPermissions — still proceed with deletion + _ = app.patchNewDocumentPermissions(ctx, taskStatus, originalDoc.Owner, originalDoc.Permissions, logger) break } @@ -631,15 +632,18 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData } // extractDocIDFromTask extracts the new document ID from a task status response. +// Only related_document is a valid source — taskStatus["id"] is the task UUID, not a document ID. func extractDocIDFromTask(taskStatus map[string]interface{}) (int, bool) { - if relatedDocStr, ok := taskStatus["related_document"].(string); ok { - if id, err := strconv.Atoi(relatedDocStr); err == nil { - return id, true + if relatedDoc, ok := taskStatus["related_document"]; ok { + switch v := relatedDoc.(type) { + case string: + if id, err := strconv.Atoi(v); err == nil { + return id, true + } + case float64: + return int(v), true } } - if idFloat, ok := taskStatus["id"].(float64); ok { - return int(idFloat), true - } return 0, false } @@ -656,22 +660,25 @@ func buildPatchFields(owner *int, permissions *Permissions) map[string]interface } // patchNewDocumentPermissions extracts the new document ID from a task status -// and patches it with the given owner and permissions. -func (app *App) patchNewDocumentPermissions(ctx context.Context, taskStatus map[string]interface{}, owner *int, permissions *Permissions, logger *logrus.Entry) { +// and patches it with the given owner and permissions. Returns an error if the +// patch fails so callers can decide whether to retry. 
+func (app *App) patchNewDocumentPermissions(ctx context.Context, taskStatus map[string]interface{}, owner *int, permissions *Permissions, logger *logrus.Entry) error { newDocID, found := extractDocIDFromTask(taskStatus) if !found { logger.Warn("Could not determine new document ID from task status") - return + return fmt.Errorf("could not determine new document ID from task status") } logger.WithField("new_doc_id", newDocID).Info("Restoring owner and permissions on new document") patchFields := buildPatchFields(owner, permissions) if len(patchFields) == 0 { - return + return nil } if err := app.Client.PatchDocument(ctx, newDocID, patchFields); err != nil { logger.WithError(err).Warn("Failed to patch owner/permissions on new document, continuing") + return err } + return nil } diff --git a/ocr/iosocr_provider.go b/ocr/iosocr_provider.go index 2e952785..595e48a8 100644 --- a/ocr/iosocr_provider.go +++ b/ocr/iosocr_provider.go @@ -236,6 +236,10 @@ func (p *IosOcrProvider) GetHOCRDocument() (*hocr.HOCR, error) { copy(pages, p.hocrPages) p.mu.Unlock() + sort.Slice(pages, func(i, j int) bool { + return pages[i].PageNumber < pages[j].PageNumber + }) + if len(pages) == 0 { return nil, fmt.Errorf("no hOCR pages collected") } diff --git a/ocr_test.go b/ocr_test.go index 1bd16874..91717c1a 100644 --- a/ocr_test.go +++ b/ocr_test.go @@ -89,7 +89,7 @@ func TestProcessDocumentOCR_SafetyFeature(t *testing.T) { }, } - for _, tc := range tc { + for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { // Set global limitOcrPages limitOcrPages = tc.limitPages @@ -228,7 +228,7 @@ func TestUploadProcessedPDF(t *testing.T) { }) // Test cases - tc := []struct { + testCases := []struct { name string options OCROptions expectReplacement bool @@ -276,7 +276,7 @@ func TestUploadProcessedPDF(t *testing.T) { }, } - for _, tc := range tc { + for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { // Reset tracking variables deleteDocCalled = false diff --git 
a/permissions.go b/permissions.go index b8183ddb..a8ea41f8 100644 --- a/permissions.go +++ b/permissions.go @@ -73,9 +73,13 @@ func (app *App) processPendingPermissionRestores(ctx context.Context) (int, erro "task_id": entry.TaskID, "original_doc": entry.OriginalDocID, }) - app.patchNewDocumentPermissions(ctx, taskStatus, entry.Owner, entry.Permissions, logger) - logger.Info("Permission restore completed successfully") - processed++ + if err := app.patchNewDocumentPermissions(ctx, taskStatus, entry.Owner, entry.Permissions, logger); err != nil { + logger.WithError(err).Warn("Permission restore failed, will retry") + remaining = append(remaining, entry) + } else { + logger.Info("Permission restore completed successfully") + processed++ + } case "FAILURE": logrus.WithFields(logrus.Fields{ From 957f65daffde0bbeacacb0cc3ed4a504ec35fde8 Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 15:25:52 +0200 Subject: [PATCH 24/26] fix: guard ReplaceOriginal permission restore with PreserveOwnerPermissions flag; distinguish mock task id from document id in test --- ocr.go | 6 +++--- ocr_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ocr.go b/ocr.go index f5b02051..1f5215e0 100644 --- a/ocr.go +++ b/ocr.go @@ -604,9 +604,9 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData if status == "SUCCESS" { logger.Info("Document processing completed successfully") - // Restore owner and permissions on the new document - // Error is logged by patchNewDocumentPermissions — still proceed with deletion - _ = app.patchNewDocumentPermissions(ctx, taskStatus, originalDoc.Owner, originalDoc.Permissions, logger) + if options.PreserveOwnerPermissions { + _ = app.patchNewDocumentPermissions(ctx, taskStatus, originalDoc.Owner, originalDoc.Permissions, logger) + } break } diff --git a/ocr_test.go b/ocr_test.go index 91717c1a..b85e4d17 100644 --- a/ocr_test.go +++ b/ocr_test.go @@ -199,7 +199,7 @@ func TestUploadProcessedPDF(t 
*testing.T) { w.WriteHeader(http.StatusOK) json.NewEncoder(w).Encode([]map[string]interface{}{ { - "id": documentID, + "id": mockTaskID, "status": "SUCCESS", "task_id": taskID, "related_document": fmt.Sprintf("%d", documentID), From 19fdec5189fc9c27612b389ba55f644da6f2bd81 Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 19:28:57 +0200 Subject: [PATCH 25/26] fix: increase ReplaceOriginal polling timeout to ~1 hour Raise maxRetries from 12 to 720 (5s intervals) so that document processing has up to ~1 hour to complete before giving up on deleting the original. This accommodates slower paperless-ngx processing common with mobile uploads, AI-based classification, or large documents. --- ocr.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocr.go b/ocr.go index 1f5215e0..b35fef1a 100644 --- a/ocr.go +++ b/ocr.go @@ -583,7 +583,7 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData // If replacing the original is requested, poll for completion and delete if options.ReplaceOriginal { // Poll for task completion - maxRetries := 12 + maxRetries := 720 waitTime := 5 * time.Second logger.Info("Waiting for document processing to complete before deletion...") From d486e452e9df95b56e6e628834146a0538ed172f Mon Sep 17 00:00:00 2001 From: vistalba Date: Mon, 18 May 2026 17:43:29 +0200 Subject: [PATCH 26/26] fix: only delete original document on confirmed SUCCESS Previously, ReplaceOriginal would delete the original document even if the processing task timed out or returned an unknown status. This could cause data loss in edge cases where the new document was not fully processed. Now a deleteOriginal flag is set only on confirmed SUCCESS. The original document is preserved and an error is returned if processing did not reach SUCCESS. 
--- ocr.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ocr.go b/ocr.go index b35fef1a..7dd3bf87 100644 --- a/ocr.go +++ b/ocr.go @@ -588,21 +588,23 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData logger.Info("Waiting for document processing to complete before deletion...") + deleteOriginal := false for i := 0; i < maxRetries; i++ { taskStatus, err := app.Client.GetTaskStatus(ctx, taskID) if err != nil { - logger.WithError(err).Warn("Failed to check task status, proceeding with deletion anyway") + logger.WithError(err).Warn("Failed to check task status, keeping original document") break } status, ok := taskStatus["status"].(string) if !ok { - logger.Warn("Could not determine task status, proceeding with deletion anyway") + logger.Warn("Could not determine task status, keeping original document") break } if status == "SUCCESS" { logger.Info("Document processing completed successfully") + deleteOriginal = true if options.PreserveOwnerPermissions { _ = app.patchNewDocumentPermissions(ctx, taskStatus, originalDoc.Owner, originalDoc.Permissions, logger) @@ -621,7 +623,9 @@ func (app *App) uploadProcessedPDF(ctx context.Context, documentID int, pdfData } } - // Delete original document (even if poll timed out — upload was successful) + if !deleteOriginal { + return fmt.Errorf("document %d was not deleted: processing did not reach SUCCESS", documentID) + } if err := app.Client.DeleteDocument(ctx, documentID); err != nil { return fmt.Errorf("error deleting original document: %w", err) }