icereed · BieggerM · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026 · Mar 4, 2026
diff --git a/Dockerfile b/Dockerfile
@@ -65,6 +65,7 @@ COPY --from=frontend /app/dist /app/web-app/dist
 # Copy the Go source files
 COPY *.go .
 COPY ocr ./ocr
+COPY sanitize ./sanitize
 
 # Import ARGs from top level
 ARG VERSION

diff --git a/app_http_handlers.go b/app_http_handlers.go
@@ -14,6 +14,8 @@ import (
 	"text/template"
 	"time"
 
+	"paperless-gpt/sanitize"
+
 	"github.com/Masterminds/sprig/v3"
 	"github.com/gin-gonic/gin"
 )
@@ -606,6 +608,7 @@ func (app *App) analyzeDocumentsHandler(c *gin.Context) {
 			log.Errorf("Error fetching document %d: %v", docID, err)
 			return
 		}
+		doc.Content = sanitize.Sanitize(doc.Content)
 		documents = append(documents, doc)
 	}
 

diff --git a/app_llm.go b/app_llm.go
@@ -12,6 +12,8 @@ import (
 
 	_ "image/jpeg"
 
+	"paperless-gpt/sanitize"
+
 	"github.com/sirupsen/logrus"
 	"github.com/tmc/langchaingo/llms"
 )
@@ -399,7 +401,7 @@ func (app *App) getSuggestedCustomFields(ctx context.Context, doc Document, sele
 		return nil, fmt.Errorf("error calculating available tokens for custom fields: %v", err)
 	}
 
-	truncatedContent, err := truncateContentByTokens(doc.Content, availableTokens)
+	truncatedContent, err := truncateContentByTokens(sanitize.Sanitize(doc.Content), availableTokens)
 	if err != nil {
 		return nil, fmt.Errorf("error truncating content for custom fields: %v", err)
 	}
@@ -527,7 +529,7 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
 			startTime := time.Now()
 			docLogger.Printf("Processing Document ID %d...", documentID)
 
-			content := doc.Content
+			content := sanitize.Sanitize(doc.Content)
 			suggestedTitle := doc.Title
 			var suggestedTags []string
 			var suggestedCorrespondent string

diff --git a/main.go b/main.go
@@ -7,6 +7,7 @@ import (
 	"net/http"
 	"os"
 	"paperless-gpt/ocr"
+	"paperless-gpt/sanitize"
 	"path/filepath"
 	"runtime"
 	"strconv"
@@ -148,6 +149,11 @@ func main() {
 	// Initialize logrus logger
 	initLogger()
 
+	// Initialize content sanitization
+	if err := sanitize.Init(); err != nil {
+		log.Fatalf("Failed to initialize sanitization: %v", err)
+	}
+
 	// Load settings from file
 	loadSettings()
 

diff --git a/ocr/llm_provider.go b/ocr/llm_provider.go
@@ -18,6 +18,7 @@ import (
 	"github.com/tmc/langchaingo/llms/mistral"
 	"github.com/tmc/langchaingo/llms/ollama"
 	"github.com/tmc/langchaingo/llms/openai"
+	"paperless-gpt/sanitize"
 )
 
 // LLMProvider implements OCR using LLM vision models
@@ -135,7 +136,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte, pag
 
 	parts = []llms.ContentPart{
 		contentPart,
-		llms.TextPart(p.prompt),
+		llms.TextPart(sanitize.Sanitize(p.prompt)),
 	}
 
 	var callOpts []llms.CallOption

diff --git a/sanitize/doc.go b/sanitize/doc.go
@@ -0,0 +1,14 @@
+// Package sanitize provides content sanitization functionality for removing sensitive
+// strings and patterns before sending content to LLMs.
+//
+// Configuration is done via environment variables:
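+//   - REMOVE_FROM_CONTENT: Comma-separated list of literal strings to remove (entries are whitespace-trimmed, empty entries are ignored, and a literal cannot contain a comma)
+//   - REMOVE_FROM_CONTENT_REGEX: Semicolon-separated list of Go regular expressions whose matches are removed (a pattern cannot contain a semicolon)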
+//
+// Example usage:
+//
+//	if err := sanitize.Init(); err != nil {
+//	    log.Fatal(err)
+//	}
+//	cleanContent := sanitize.Sanitize(dirtyContent)
+package sanitize
diff --git a/sanitize/sanitize.go b/sanitize/sanitize.go
@@ -0,0 +1,94 @@
+package sanitize
+
+import (
+	"fmt"
+	"os"
+	"regexp"
+	"strings"
+	"sync"
+)
+
+var (
+	literalPatterns []string         // literal strings parsed from REMOVE_FROM_CONTENT; Sanitize deletes each occurrence
+	regexPatterns   []*regexp.Regexp // expressions compiled from REMOVE_FROM_CONTENT_REGEX; Sanitize deletes each match
+	initOnce        sync.Once        // guards the body of Init so the environment is parsed only once
+	initErr         error            // error recorded inside Init's once-body; returned by every Init call
+)
+
+// Init loads REMOVE_FROM_CONTENT (comma-separated literals) and
+// REMOVE_FROM_CONTENT_REGEX (semicolon-separated regular expressions) from the
+// environment. The work runs once; every call returns the error recorded by
+// that run, or nil. Regexes are published only after all of them compile.
+func Init() error {
+	initOnce.Do(func() {
+		if literals := os.Getenv("REMOVE_FROM_CONTENT"); literals != "" {
+			literalPatterns = parseCommaSeparated(literals)
+		}
+
+		// parseSemicolonSeparated drops empty entries, so no blank-pattern
+		// check is needed here; an unset variable simply yields no patterns.
+		sources := parseSemicolonSeparated(os.Getenv("REMOVE_FROM_CONTENT_REGEX"))
+		var compiled []*regexp.Regexp
+		for _, src := range sources {
+			re, err := regexp.Compile(src)
+			if err != nil {
+				initErr = fmt.Errorf("invalid regex pattern %q: %w", src, err)
+				return
+			}
+			compiled = append(compiled, re)
+		}
+		regexPatterns = compiled
+	})
+	return initErr
+}
+
+// splitAndTrim splits s on sep, trims surrounding whitespace from each piece
+// and drops pieces that are empty after trimming. The result is never nil.
+func splitAndTrim(s, sep string) []string {
+	parts := strings.Split(s, sep)
+	result := make([]string, 0, len(parts))
+	for _, part := range parts {
+		if trimmed := strings.TrimSpace(part); trimmed != "" {
+			result = append(result, trimmed)
+		}
+	}
+	return result
+}
+
+// parseCommaSeparated splits s on commas, trims whitespace around each entry
+// and omits empty entries. Entries cannot contain a comma because no escape
+// syntax is supported.
+func parseCommaSeparated(s string) []string {
+	return splitAndTrim(s, ",")
+}
+
+// parseSemicolonSeparated splits s on semicolons, trims whitespace around each
+// entry and omits empty entries.
+func parseSemicolonSeparated(s string) []string {
+	return splitAndTrim(s, ";")
+}
+
+// Sanitize returns content with every configured literal and every regex
+// match removed. Literals are stripped first, then each regex in turn. Empty
+// content is returned as-is; if Init reports an error the result is "".
+func Sanitize(content string) string {
+	if content == "" {
+		return content
+	}
+	if Init() != nil {
+		return ""
+	}
+
+	cleaned := content
+
+	// Literals are removed one after another, in the order they were configured.
+	for _, literal := range literalPatterns {
+		cleaned = strings.ReplaceAll(cleaned, literal, "")
+	}
+
+	// Each compiled expression then deletes all of its matches.
+	for _, expr := range regexPatterns {
+		cleaned = expr.ReplaceAllString(cleaned, "")
+	}
+	return cleaned
+}
diff --git a/sanitize/sanitize_test.go b/sanitize/sanitize_test.go
@@ -0,0 +1,162 @@
+package sanitize
+
+import (
+	"sync"
+	"testing"
+)
+
+func TestSanitize(t *testing.T) {
+	tests := []struct {
+		name            string // subtest name
+		literals        string // value assigned to REMOVE_FROM_CONTENT
+		regexes         string // value assigned to REMOVE_FROM_CONTENT_REGEX
+		input           string // content passed to Sanitize
+		expected        string // expected return value of Sanitize
+		expectInitError bool   // true when Init must fail; Sanitize is not checked for such cases
+	}{
+		{
+			name:     "no patterns",
+			input:    "Hello World",
+			expected: "Hello World",
+		},
+		{
+			name:     "literal removal",
+			literals: "World",
+			input:    "Hello World",
+			expected: "Hello ",
+		},
+		{
+			name:     "multiple literals",
+			literals: "foo,bar",
+			input:    "foobar baz",
+			expected: " baz",
+		},
+		{
+			name:     "literals with spaces",
+			literals: " foo , bar ",
+			input:    "foo bar baz",
+			expected: "  baz",
+		},
+		{
+			name:     "regex iban",
+			regexes:  `DE\d{20}`,
+			input:    "IBAN: DE56123341212312312312 end",
+			expected: "IBAN:  end",
+		},
+		{
+			name:     "regex email",
+			regexes:  `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`,
+			input:    "Contact: john@example.com or jane@test.org",
+			expected: "Contact:  or ",
+		},
+		{
+			name:     "literal and regex combined",
+			literals: "CONFIDENTIAL",
+			regexes:  `\b\d{4}-\d{4}-\d{4}-\d{4}\b`,
+			input:    "CONFIDENTIAL card: 1234-5678-9012-3456",
+			expected: " card: ",
+		},
+		{
+			name:            "invalid regex",
+			regexes:         `[invalid`,
+			input:           "test",
+			expectInitError: true,
+		},
+		{
+			name:     "empty content",
+			literals: "test",
+			input:    "",
+			expected: "",
+		},
+		{
+			name:     "regex with semicolon separator",
+			regexes:  `[A-Z]{2}\d{2}[A-Z0-9]+;test@example\.com`,
+			input:    "IBAN: DE561233412123123 and email: test@example.com",
+			expected: "IBAN:  and email: ",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Reset the package-level state so Init re-reads the environment for this case
+			literalPatterns = nil
+			regexPatterns = nil
+			initOnce = sync.Once{}
+			initErr = nil
+
+			// Configure this case's patterns through the two environment variables Init reads
+			t.Setenv("REMOVE_FROM_CONTENT", tt.literals)
+			t.Setenv("REMOVE_FROM_CONTENT_REGEX", tt.regexes)
+
+			// Run Init; cases that expect an error only check that one was returned and stop here
+			err := Init()
+			if tt.expectInitError {
+				if err == nil {
+					t.Errorf("expected init error, got nil")
+				}
+				return
+			}
+			if err != nil {
+				t.Fatalf("unexpected init error: %v", err)
+			}
+
+			// Compare the sanitized output with the expected string
+			result := Sanitize(tt.input)
+			if result != tt.expected {
+				t.Errorf("Sanitize() = %q, want %q", result, tt.expected)
+			}
+		})
+	}
+}
+
+// TestParseCommaSeparated covers comma splitting, whitespace trimming and removal of empty entries.
+func TestParseCommaSeparated(t *testing.T) {
+	cases := []struct {
+		input    string
+		expected []string
+	}{
+		{"a,b,c", []string{"a", "b", "c"}},
+		{"a, b, c", []string{"a", "b", "c"}},
+		{"  a  ,  b  ", []string{"a", "b"}},
+		{"a,,b", []string{"a", "b"}},
+		{"a", []string{"a"}},
+		{"", []string{}},
+	}
+	for _, tc := range cases {
+		got := parseCommaSeparated(tc.input)
+		if len(got) != len(tc.expected) {
+			t.Errorf("parseCommaSeparated(%q) = %v, want %v", tc.input, got, tc.expected)
+			continue
+		}
+		for i, want := range tc.expected {
+			if got[i] != want {
+				t.Errorf("parseCommaSeparated(%q)[%d] = %q, want %q", tc.input, i, got[i], want)
+			}
+		}
+	}
+}
+
+// TestParseSemicolonSeparated covers semicolon splitting, whitespace trimming and regex-like entries.
+func TestParseSemicolonSeparated(t *testing.T) {
+	cases := []struct {
+		input    string
+		expected []string
+	}{
+		{`a;b;c`, []string{"a", "b", "c"}},
+		{`a; b; c`, []string{"a", "b", "c"}},
+		{`DE\d{20};[a-z]+`, []string{`DE\d{20}`, `[a-z]+`}},
+		{``, []string{}},
+	}
+	for _, tc := range cases {
+		got := parseSemicolonSeparated(tc.input)
+		if len(got) != len(tc.expected) {
+			t.Errorf("parseSemicolonSeparated(%q) = %v, want %v", tc.input, got, tc.expected)
+			continue
+		}
+		for i, want := range tc.expected {
+			if got[i] != want {
+				t.Errorf("parseSemicolonSeparated(%q)[%d] = %q, want %q", tc.input, i, got[i], want)
+			}
+		}
+	}
+}