Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ COPY --from=frontend /app/dist /app/web-app/dist
# Copy the Go source files
COPY *.go .
COPY ocr ./ocr
COPY sanitize ./sanitize

# Import ARGs from top level
ARG VERSION
Expand Down
3 changes: 3 additions & 0 deletions app_http_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ import (
"text/template"
"time"

"paperless-gpt/sanitize"

"github.com/Masterminds/sprig/v3"
"github.com/gin-gonic/gin"
)
Expand Down Expand Up @@ -606,6 +608,7 @@ func (app *App) analyzeDocumentsHandler(c *gin.Context) {
log.Errorf("Error fetching document %d: %v", docID, err)
return
}
doc.Content = sanitize.Sanitize(doc.Content)
documents = append(documents, doc)
}

Expand Down
6 changes: 4 additions & 2 deletions app_llm.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ import (

_ "image/jpeg"

"paperless-gpt/sanitize"

"github.com/sirupsen/logrus"
"github.com/tmc/langchaingo/llms"
)
Expand Down Expand Up @@ -399,7 +401,7 @@ func (app *App) getSuggestedCustomFields(ctx context.Context, doc Document, sele
return nil, fmt.Errorf("error calculating available tokens for custom fields: %v", err)
}

truncatedContent, err := truncateContentByTokens(doc.Content, availableTokens)
truncatedContent, err := truncateContentByTokens(sanitize.Sanitize(doc.Content), availableTokens)
if err != nil {
return nil, fmt.Errorf("error truncating content for custom fields: %v", err)
}
Expand Down Expand Up @@ -527,7 +529,7 @@ func (app *App) generateDocumentSuggestions(ctx context.Context, suggestionReque
startTime := time.Now()
docLogger.Printf("Processing Document ID %d...", documentID)

content := doc.Content
content := sanitize.Sanitize(doc.Content)
suggestedTitle := doc.Title
var suggestedTags []string
var suggestedCorrespondent string
Expand Down
6 changes: 6 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"os"
"paperless-gpt/ocr"
"paperless-gpt/sanitize"
"path/filepath"
"runtime"
"strconv"
Expand Down Expand Up @@ -148,6 +149,11 @@ func main() {
// Initialize logrus logger
initLogger()

// Initialize content sanitization
if err := sanitize.Init(); err != nil {
log.Fatalf("Failed to initialize sanitization: %v", err)
}

// Load settings from file
loadSettings()

Expand Down
3 changes: 2 additions & 1 deletion ocr/llm_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/tmc/langchaingo/llms/mistral"
"github.com/tmc/langchaingo/llms/ollama"
"github.com/tmc/langchaingo/llms/openai"
"paperless-gpt/sanitize"
)

// LLMProvider implements OCR using LLM vision models
Expand Down Expand Up @@ -135,7 +136,7 @@ func (p *LLMProvider) ProcessImage(ctx context.Context, imageContent []byte, pag

parts = []llms.ContentPart{
contentPart,
llms.TextPart(p.prompt),
llms.TextPart(sanitize.Sanitize(p.prompt)),
}

var callOpts []llms.CallOption
Expand Down
14 changes: 14 additions & 0 deletions sanitize/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Package sanitize provides content sanitization functionality for removing sensitive
// strings and patterns before sending content to LLMs.
//
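// Configuration is done via environment variables:
// - REMOVE_FROM_CONTENT: Comma-separated list of literal strings to remove.
//   Each entry is whitespace-trimmed and empty entries are ignored, so a
//   literal cannot contain a comma or leading/trailing whitespace.
// - REMOVE_FROM_CONTENT_REGEX: Semicolon-separated list of regex patterns
//   (Go regexp syntax) to remove. Each pattern is whitespace-trimmed, so a
//   pattern cannot contain a semicolon or leading/trailing whitespace.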
//
// Example usage:
//
// if err := sanitize.Init(); err != nil {
// log.Fatal(err)
// }
// cleanContent := sanitize.Sanitize(dirtyContent)
package sanitize
94 changes: 94 additions & 0 deletions sanitize/sanitize.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package sanitize

import (
	"fmt"
	"os"
	"regexp"
	"sort"
	"strings"
	"sync"
)

// Package-level sanitization state: written inside Init, read by Sanitize.
var (
// literalPatterns holds the literal strings parsed from REMOVE_FROM_CONTENT.
literalPatterns []string
// regexPatterns holds the expressions compiled from REMOVE_FROM_CONTENT_REGEX.
regexPatterns []*regexp.Regexp
// initOnce guards the body of Init so that it runs a single time.
initOnce sync.Once
// initErr is the error recorded by Init's one-time run; Init returns it on every call.
initErr error
)

// Init loads the sanitization patterns from the environment.
//
// The work is guarded by a sync.Once, so it is performed on the first call
// only; every call returns the error recorded by that single run.
//
// REMOVE_FROM_CONTENT is read as a comma-separated list of literal strings and
// REMOVE_FROM_CONTENT_REGEX as a semicolon-separated list of regular
// expressions. Literals are stored longest-first so that a short literal that
// is a substring of a longer one (for example "John" and "John Smith") cannot
// strip part of the longer match and leave the remainder behind.
//
// It returns an error naming the first regex pattern that fails to compile.
// Patterns compiled before the failing one remain in regexPatterns.
func Init() error {
	initOnce.Do(func() {
		// Literal strings.
		if literals := os.Getenv("REMOVE_FROM_CONTENT"); literals != "" {
			literalPatterns = parseCommaSeparated(literals)
			// Stable sort: literals of equal length keep their configured order.
			sort.SliceStable(literalPatterns, func(i, j int) bool {
				return len(literalPatterns[i]) > len(literalPatterns[j])
			})
		}

		// Regular expressions.
		if regexStr := os.Getenv("REMOVE_FROM_CONTENT_REGEX"); regexStr != "" {
			for _, pattern := range parseSemicolonSeparated(regexStr) {
				if pattern == "" {
					continue
				}
				re, err := regexp.Compile(pattern)
				if err != nil {
					initErr = fmt.Errorf("invalid regex pattern %q: %w", pattern, err)
					return
				}
				regexPatterns = append(regexPatterns, re)
			}
		}
	})
	return initErr
}
Comment thread
BieggerM marked this conversation as resolved.

// parseCommaSeparated breaks s apart at every comma, strips surrounding
// whitespace from each piece and drops pieces that end up empty. The returned
// slice is never nil; it has length zero when no piece survives.
func parseCommaSeparated(s string) []string {
	fields := strings.Split(s, ",")
	// Filter in place: the kept entries are written over the front of fields.
	kept := fields[:0]
	for i := range fields {
		if item := strings.TrimSpace(fields[i]); item != "" {
			kept = append(kept, item)
		}
	}
	return kept
}

// parseSemicolonSeparated breaks s apart at every semicolon, strips
// surrounding whitespace from each piece and drops pieces that end up empty.
// The returned slice is never nil; it has length zero when no piece survives.
func parseSemicolonSeparated(s string) []string {
	fields := strings.Split(s, ";")
	// Filter in place: the kept entries are written over the front of fields.
	kept := fields[:0]
	for i := range fields {
		if item := strings.TrimSpace(fields[i]); item != "" {
			kept = append(kept, item)
		}
	}
	return kept
}

// Sanitize returns content with every configured literal string and every
// match of every configured regex deleted. Literals are applied first, then
// the regexes, each in the order stored by Init.
//
// Empty content is returned as-is without calling Init. If Init reports an
// error, the empty string is returned instead of the input.
func Sanitize(content string) string {
	switch {
	case content == "":
		return content
	case Init() != nil:
		return ""
	}

	cleaned := content

	// Literal strings first.
	for i := range literalPatterns {
		cleaned = strings.ReplaceAll(cleaned, literalPatterns[i], "")
	}

	// Then the compiled expressions.
	for i := range regexPatterns {
		cleaned = regexPatterns[i].ReplaceAllString(cleaned, "")
	}

	return cleaned
}
Comment thread
BieggerM marked this conversation as resolved.
162 changes: 162 additions & 0 deletions sanitize/sanitize_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package sanitize

import (
"sync"
"testing"
)

// TestSanitize runs Init and Sanitize against a table of environment
// configurations. Each case resets the package state, sets both environment
// variables, and then checks either that Init fails or that Sanitize produces
// the expected text.
//
// Removal deletes only the matched text: when a match had a space on both
// sides, both spaces remain in the expected output.
func TestSanitize(t *testing.T) {
	tests := []struct {
		name            string
		literals        string // value for REMOVE_FROM_CONTENT
		regexes         string // value for REMOVE_FROM_CONTENT_REGEX
		input           string
		expected        string
		expectInitError bool
	}{
		{
			name:     "no patterns",
			input:    "Hello World",
			expected: "Hello World",
		},
		{
			name:     "literal removal",
			literals: "World",
			input:    "Hello World",
			expected: "Hello ",
		},
		{
			name:     "multiple literals",
			literals: "foo,bar",
			input:    "foobar baz",
			expected: " baz",
		},
		{
			name:     "literals with spaces",
			literals: " foo , bar ",
			input:    "foo bar baz",
			// "foo" and "bar" are removed; the two separating spaces stay.
			expected: "  baz",
		},
		{
			name:     "regex iban",
			regexes:  `DE\d{20}`,
			input:    "IBAN: DE56123341212312312312 end",
			expected: "IBAN:  end",
		},
		{
			name:     "regex email",
			regexes:  `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`,
			input:    "Contact: john@example.com or jane@test.org",
			expected: "Contact:  or ",
		},
		{
			name:     "literal and regex combined",
			literals: "CONFIDENTIAL",
			regexes:  `\b\d{4}-\d{4}-\d{4}-\d{4}\b`,
			input:    "CONFIDENTIAL card: 1234-5678-9012-3456",
			expected: " card: ",
		},
		{
			name:            "invalid regex",
			regexes:         `[invalid`,
			input:           "test",
			expectInitError: true,
		},
		{
			name:     "empty content",
			literals: "test",
			input:    "",
			expected: "",
		},
		{
			name:     "regex with semicolon separator",
			regexes:  `[A-Z]{2}\d{2}[A-Z0-9]+;test@example\.com`,
			input:    "IBAN: DE561233412123123 and email: test@example.com",
			expected: "IBAN:  and email: ",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Reset state so Init re-reads the environment for this case.
			literalPatterns = nil
			regexPatterns = nil
			initOnce = sync.Once{}
			initErr = nil

			// Set env vars
			t.Setenv("REMOVE_FROM_CONTENT", tt.literals)
			t.Setenv("REMOVE_FROM_CONTENT_REGEX", tt.regexes)

			// Initialize
			err := Init()
			if tt.expectInitError {
				if err == nil {
					t.Errorf("expected init error, got nil")
				}
				return
			}
			if err != nil {
				t.Fatalf("unexpected init error: %v", err)
			}

			// Test sanitization
			result := Sanitize(tt.input)
			if result != tt.expected {
				t.Errorf("Sanitize() = %q, want %q", result, tt.expected)
			}
		})
	}
}

// TestParseCommaSeparated checks splitting, whitespace trimming and the
// dropping of empty entries for comma-separated input.
func TestParseCommaSeparated(t *testing.T) {
	cases := []struct {
		input    string
		expected []string
	}{
		{"a,b,c", []string{"a", "b", "c"}},
		{"a, b, c", []string{"a", "b", "c"}},
		{" a , b ", []string{"a", "b"}},
		{"a,,b", []string{"a", "b"}},
		{"a", []string{"a"}},
		{"", []string{}},
	}

	for _, tc := range cases {
		got := parseCommaSeparated(tc.input)
		// A length mismatch is reported once; element checks would only add noise.
		if len(got) != len(tc.expected) {
			t.Errorf("parseCommaSeparated(%q) = %v, want %v", tc.input, got, tc.expected)
			continue
		}
		for idx, want := range tc.expected {
			if got[idx] == want {
				continue
			}
			t.Errorf("parseCommaSeparated(%q)[%d] = %q, want %q", tc.input, idx, got[idx], want)
		}
	}
}

// TestParseSemicolonSeparated checks splitting and whitespace trimming for
// semicolon-separated input, including regex text with braces and backslashes.
func TestParseSemicolonSeparated(t *testing.T) {
	cases := []struct {
		input    string
		expected []string
	}{
		{`a;b;c`, []string{"a", "b", "c"}},
		{`a; b; c`, []string{"a", "b", "c"}},
		{`DE\d{20};[a-z]+`, []string{`DE\d{20}`, `[a-z]+`}},
		{``, []string{}},
	}

	for _, tc := range cases {
		got := parseSemicolonSeparated(tc.input)
		// A length mismatch is reported once; element checks would only add noise.
		if len(got) != len(tc.expected) {
			t.Errorf("parseSemicolonSeparated(%q) = %v, want %v", tc.input, got, tc.expected)
			continue
		}
		for idx, want := range tc.expected {
			if got[idx] == want {
				continue
			}
			t.Errorf("parseSemicolonSeparated(%q)[%d] = %q, want %q", tc.input, idx, got[idx], want)
		}
	}
}