Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ services:
PAPERLESS_PUBLIC_URL: "http://paperless.mydomain.com" # Optional
MANUAL_TAG: "paperless-gpt" # Optional, default: paperless-gpt
AUTO_TAG: "paperless-gpt-auto" # Optional, default: paperless-gpt-auto
FAIL_TAG: "paperless-gpt-failed" # Optional, default: paperless-gpt-failed. Applied to documents whose update paperless-ngx rejected in whole or in part, so they can be found for review and are not re-processed in a loop. Auto-created at startup.
# LLM Configuration - Choose one:

# Option 1: Standard OpenAI
Expand Down Expand Up @@ -544,6 +545,7 @@ For best results with the enhanced OCR features:
| `PAPERLESS_PUBLIC_URL` | Public URL for Paperless (if different from `PAPERLESS_BASE_URL`). | No | |
| `MANUAL_TAG` | Tag for manual processing. | No | paperless-gpt |
| `AUTO_TAG` | Tag for auto processing. | No | paperless-gpt-auto |
| `FAIL_TAG` | Tag applied to a document when paperless-gpt could not apply the full LLM suggestion. Two cases trigger it: (1) **partial success** — paperless-ngx rejected one or more fields (e.g. an LLM-suggested date in an impossible format such as `2023-01-79`); paperless-gpt drops the rejected fields, retries the update with the rest, and applies this tag so the user knows the document needs review; (2) **hard failure** — the update could not be salvaged; paperless-gpt removes the auto tag (to break the processing loop) and applies this tag. The tag is created automatically in paperless-ngx at startup if it does not exist. | No | paperless-gpt-failed |
| `LLM_PROVIDER` | AI backend (`openai`, `ollama`, `googleai`, `mistral`, or `anthropic`). | Yes | |
| `LLM_MODEL` | AI model name (e.g., `gpt-4o`, `mistral-large-latest`, `qwen3:8b`, `claude-sonnet-4-5`). | Yes | |
| `OPENAI_API_KEY` | OpenAI API key (required if using OpenAI). | Cond. | |
Expand Down
101 changes: 101 additions & 0 deletions background.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import (
"slices"
"strings"
"time"

"gorm.io/gorm"
)

// This is our interface, allowing us to enable proper testing
Expand Down Expand Up @@ -78,6 +80,84 @@ func StartBackgroundTasks(ctx context.Context, app BackgroundProcessor) {
}()
}

// applyFailTagAfterPartialSuccess marks a document with the configured fail
// tag after an update that only went through once some fields had been
// dropped (droppedFields lists them and is used for logging only).
//
// Steps, in order:
//  1. If failTag is empty, log a warning and do nothing else.
//  2. Fetch the document's current state via client.GetDocument, so the
//     follow-up update is built on the tags the document has now rather than
//     on a stale copy.
//  3. If the fail tag is already among the current tags, log and stop.
//  4. Otherwise send a single suggestion through client.UpdateDocuments that
//     carries only the fail tag, with KeepOriginalTags set.
//
// The function is best-effort: a failure in step 2 or step 4 is logged at
// error level and not returned, which leaves the document without the fail
// tag.
func applyFailTagAfterPartialSuccess(ctx context.Context, client ClientInterface, db *gorm.DB, documentID int, droppedFields []string) {
	logger := documentLogger(documentID)

	// Without a configured fail tag there is nothing to apply; just record why.
	if failTag == "" {
		logger.Warnf("Document %d update succeeded after paperless-ngx rejected fields %v; no FAIL_TAG is configured, so the document is not marked for review.", documentID, droppedFields)
		return
	}

	// Work from the freshly fetched document, not from what the caller held.
	latest, fetchErr := client.GetDocument(ctx, documentID)
	if fetchErr != nil {
		logger.Errorf("Document %d update succeeded after dropping fields %v, but fetching current state to apply fail tag failed: %v", documentID, droppedFields, fetchErr)
		return
	}

	// Skip the extra update when the marker is already in place.
	alreadyMarked := slices.Contains(latest.Tags, failTag)
	if alreadyMarked {
		logger.Warnf("Document %d update succeeded after dropping fields %v; fail tag %q is already present.", documentID, droppedFields, failTag)
		return
	}

	markForReview := []DocumentSuggestion{{
		ID:               documentID,
		OriginalDocument: latest,
		SuggestedTags:    []string{failTag},
		KeepOriginalTags: true,
	}}
	patchErr := client.UpdateDocuments(ctx, markForReview, db, false)
	if patchErr != nil {
		logger.Errorf("Document %d update succeeded after dropping fields %v, but applying fail tag %q failed: %v", documentID, droppedFields, failTag, patchErr)
		return
	}

	logger.Warnf("Document %d update succeeded after paperless-ngx rejected fields %v; fail tag %q applied for user review.", documentID, droppedFields, failTag)
}

// recoverFromFailedUpdate issues a follow-up, tag-only update for a document
// whose regular update failed. Its purpose is to break the processing loop:
// the tag named by removeTag (the tag the document was picked up by) is put
// in RemoveTags so the next poll does not select the document again.
//
// When failTag is non-empty it is additionally sent as the only suggested
// tag, with KeepOriginalTags set, so failed documents carry a visible marker.
// When failTag is empty, no suggested tags are sent and KeepOriginalTags
// stays false.
//
// The function never returns an error. If the recovery update fails as well,
// that is logged at error level and nothing further is attempted; the caller
// is expected to have recorded the original failure already.
func recoverFromFailedUpdate(ctx context.Context, client ClientInterface, db *gorm.DB, document Document, removeTag string) {
	logger := documentLogger(document.ID)
	markFailed := failTag != ""

	patch := DocumentSuggestion{
		ID:               document.ID,
		OriginalDocument: document,
		RemoveTags:       []string{removeTag},
		KeepOriginalTags: markFailed,
	}
	if markFailed {
		patch.SuggestedTags = []string{failTag}
	}

	updateErr := client.UpdateDocuments(ctx, []DocumentSuggestion{patch}, db, false)
	if updateErr != nil {
		logger.Errorf("Recovery update for failed document %d also failed: %v. The %q tag may still be present and the document may be re-processed on the next poll cycle.", document.ID, updateErr, removeTag)
		return
	}

	if !markFailed {
		logger.Warnf("Document %d update failed; %q tag removed to break the processing loop (no failTag configured).", document.ID, removeTag)
		return
	}
	logger.Warnf("Document %d update failed; %q tag removed and %q tag applied to break the processing loop.", document.ID, removeTag, failTag)
}

// processAutoTagDocuments handles the background auto-tagging of documents
func (app *App) processAutoTagDocuments(ctx context.Context) (int, error) {
documents, err := app.Client.GetDocumentsByTag(ctx, autoTag, 25)
Expand Down Expand Up @@ -137,9 +217,21 @@ func (app *App) processAutoTagDocuments(ctx context.Context) (int, error) {

err = app.Client.UpdateDocuments(ctx, suggestions, app.Database, false)
if err != nil {
var partial *PartialUpdateError
if errors.As(err, &partial) {
// Update went through but paperless-ngx rejected some fields,
// which UpdateDocuments dropped in order to land the rest.
// The auto tag is already gone (it was part of the successful
// retry's tag update). Apply the fail tag so the user sees
// that this document needs review.
applyFailTagAfterPartialSuccess(ctx, app.Client, app.Database, partial.DocumentID, partial.DroppedFields)
processedCount++
continue
}
err = fmt.Errorf("error updating document %d: %w", document.ID, err)
docLogger.Error(err.Error())
errs = append(errs, err)
recoverFromFailedUpdate(ctx, app.Client, app.Database, document, autoTag)
continue
}

Expand Down Expand Up @@ -270,8 +362,17 @@ func (app *App) processAutoOcrTagDocuments(ctx context.Context) (int, error) {
documentSuggestion,
}, app.Database, false)
if err != nil {
var partial *PartialUpdateError
if errors.As(err, &partial) {
applyFailTagAfterPartialSuccess(ctx, app.Client, app.Database, partial.DocumentID, partial.DroppedFields)
// Treat as a (partial) success: tag was removed, fail tag applied.
docLogger.Info("Successfully processed document OCR (with partial-update fail-tag marker)")
successCount++
continue
}
docLogger.Errorf("Update after OCR failed: %v", err)
errs = append(errs, fmt.Errorf("document %d update error: %w", document.ID, err))
recoverFromFailedUpdate(ctx, app.Client, app.Database, document, autoOcrTag)
continue
}
}
Expand Down
79 changes: 79 additions & 0 deletions background_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -541,3 +541,82 @@ func TestProcessAutoOcrTagDocuments(t *testing.T) {
})
}
}

// recordingClient is a stub ClientInterface used to verify the loop-break
// recovery PATCH built by recoverFromFailedUpdate. It overrides
// UpdateDocuments to record what it is given instead of talking to
// paperless-ngx.
//
// NOTE(review): *PaperlessClient is presumably embedded only so the stub
// picks up the remaining ClientInterface methods. The tests in this file
// construct the stub without setting it, so any promoted method would run on
// a nil *PaperlessClient — confirm before using this stub for code paths
// that call anything other than UpdateDocuments.
type recordingClient struct {
*PaperlessClient
// calls holds every DocumentSuggestion passed to UpdateDocuments, in the
// order received (recorded whether or not the call then fails).
calls []DocumentSuggestion
// updateErr is the error UpdateDocuments returns for a failing call. It is
// only returned while failAfterNCalls > 0; on its own it has no effect.
updateErr error
// failAfterNCalls is the number of upcoming UpdateDocuments calls that
// return updateErr; it is decremented on each such call. Despite the name,
// the failures come first: 0 = always succeed, N>0 = fail the first N calls
// and succeed afterwards.
failAfterNCalls int
}

// UpdateDocuments records every suggestion it receives in r.calls and then
// reports the stubbed outcome: while r.failAfterNCalls is positive the
// counter is decremented and r.updateErr is returned, otherwise nil.
// ctx, db and isUndo are accepted to match the real signature and ignored.
func (r *recordingClient) UpdateDocuments(ctx context.Context, documents []DocumentSuggestion, db *gorm.DB, isUndo bool) error {
	// Record first, so failing calls are visible to the test as well.
	r.calls = append(r.calls, documents...)

	if r.failAfterNCalls <= 0 {
		return nil
	}
	r.failAfterNCalls--
	return r.updateErr
}

// TestRecoverFromFailedUpdate checks the single recovery update that
// recoverFromFailedUpdate sends through the client:
//   - the tag the document was picked up by is listed in RemoveTags;
//   - when failTag is set, it is the only suggested tag and KeepOriginalTags
//     is true;
//   - when failTag is empty, no tag is suggested and KeepOriginalTags is false;
//   - an error from the recovery update itself does not panic;
//   - the same holds when the tag to remove is the OCR auto tag.
//
// Both processAutoTagDocuments and processAutoOcrTagDocuments call
// recoverFromFailedUpdate after a failed update.
func TestRecoverFromFailedUpdate(t *testing.T) {
	const (
		autoTagName = "paperless-gpt-auto"
		ocrTagName  = "paperless-gpt-ocr-auto"
		failTagName = "paperless-gpt-failed"
	)

	// failTag is package-level state; put it back once all subtests are done.
	savedFailTag := failTag
	t.Cleanup(func() { failTag = savedFailTag })

	failingDoc := Document{ID: 42, Title: "Failing Doc", Tags: []string{autoTagName, "Eingang"}}

	// runRecovery sets failTag, runs the recovery against stub and hands back
	// everything the stub recorded.
	runRecovery := func(stub *recordingClient, configuredFailTag, tagToRemove string) []DocumentSuggestion {
		failTag = configuredFailTag
		recoverFromFailedUpdate(context.Background(), stub, nil, failingDoc, tagToRemove)
		return stub.calls
	}

	t.Run("removes auto tag and adds fail tag", func(t *testing.T) {
		recorded := runRecovery(&recordingClient{}, failTagName, autoTagName)

		require.Len(t, recorded, 1, "recovery should issue exactly one UpdateDocuments call")
		patch := recorded[0]
		assert.Equal(t, 42, patch.ID)
		assert.Equal(t, []string{autoTagName}, patch.RemoveTags, "auto tag must be removed to break the loop")
		assert.Equal(t, []string{failTagName}, patch.SuggestedTags, "fail tag must be added as a marker")
		assert.True(t, patch.KeepOriginalTags, "must keep original tags so we only add the fail tag, not replace all tags")
	})

	t.Run("works without fail tag — still removes auto tag", func(t *testing.T) {
		recorded := runRecovery(&recordingClient{}, "", autoTagName)

		require.Len(t, recorded, 1)
		patch := recorded[0]
		assert.Equal(t, []string{autoTagName}, patch.RemoveTags, "auto tag must be removed even when fail tag is disabled")
		assert.Empty(t, patch.SuggestedTags, "no fail tag should be added when failTag is empty")
		assert.False(t, patch.KeepOriginalTags, "KeepOriginalTags is only meaningful when adding suggested tags")
	})

	t.Run("logs but does not panic when recovery itself fails", func(t *testing.T) {
		unreachable := &recordingClient{
			updateErr:       errors.New("paperless unreachable"),
			failAfterNCalls: 1,
		}

		// Recovery is best-effort, so a failing client must not panic.
		recorded := runRecovery(unreachable, failTagName, autoTagName)

		require.Len(t, recorded, 1, "recovery call should have been attempted")
	})

	t.Run("works for OCR auto tag too", func(t *testing.T) {
		recorded := runRecovery(&recordingClient{}, failTagName, ocrTagName)

		require.Len(t, recorded, 1)
		assert.Equal(t, []string{ocrTagName}, recorded[0].RemoveTags)
	})
}
15 changes: 15 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ var (
autoTag = os.Getenv("AUTO_TAG")
manualOcrTag = os.Getenv("MANUAL_OCR_TAG") // Not used yet
autoOcrTag = os.Getenv("AUTO_OCR_TAG")
failTag = os.Getenv("FAIL_TAG")
ocrProcessMode = os.Getenv("OCR_PROCESS_MODE")
llmProvider = os.Getenv("LLM_PROVIDER")
llmModel = os.Getenv("LLM_MODEL")
Expand Down Expand Up @@ -162,6 +163,15 @@ func main() {
// Initialize PaperlessClient
client := NewPaperlessClient(paperlessBaseURL, paperlessAPIToken)

// Ensure the fail tag exists in paperless-ngx. paperless-gpt applies this
// tag mechanically when document processing fails (see processAutoTagDocuments),
// so it must be available regardless of the CREATE_NEW_TAGS setting.
// A failure here is logged but non-fatal: the loop-break path still removes
// the auto tag, the fail tag is just not applied.
if err := client.EnsureTagExists(ctx, failTag); err != nil {
log.Warnf("Failed to ensure fail tag %q exists: %v. Recovery from a failed document update will still remove the auto tag (loop break works), but the fail tag will not be added.", failTag, err)
}

// Initial fetch of custom fields
refreshCustomFieldsCache(client)

Expand Down Expand Up @@ -581,6 +591,11 @@ func validateOrDefaultEnvVars() {
autoOcrTag = "paperless-gpt-ocr-auto"
}

if failTag == "" {
failTag = "paperless-gpt-failed"
}
fmt.Printf("Using %s as fail tag\n", failTag)

if pdfOCRCompleteTag == "" {
pdfOCRCompleteTag = "paperless-gpt-ocr-complete"
}
Expand Down
Loading