-
-
Notifications
You must be signed in to change notification settings - Fork 174
Content Sanitization for LLM API Calls #917
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
BieggerM
wants to merge
7
commits into
icereed:main
Choose a base branch
from
BieggerM:main
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
3b12d1e
Add content sanitization feature for removing sensitive data
BieggerM 770eac1
Fix Docker build: copy sanitize package to build context
BieggerM 97f3db8
Fix: sanitize document content in custom fields processing
BieggerM 8d1d95a
Fix: sanitize document content in ad-hoc analysis
BieggerM 1dae1cd
Merge pull request #1 from BieggerM/sanitize
BieggerM fccd7e9
fix(sanitize): reset initErr in tests to prevent state leakage
BieggerM c688344
fix(sanitize): add defensive init check to fail closed on error
BieggerM File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| // Package sanitize provides content sanitization functionality for removing sensitive | ||
| // strings and patterns before sending content to LLMs. | ||
| // | ||
| // Configuration is done via environment variables: | ||
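| // - REMOVE_FROM_CONTENT: Comma-separated list of literal strings to remove | ||
| // - REMOVE_FROM_CONTENT_REGEX: Semicolon-separated list of regex patterns to remove | ||
| // | ||
| // Entries are trimmed of surrounding whitespace and empty entries are ignored, so a | ||
| // literal cannot contain a comma and a regex pattern cannot contain a semicolon. | ||
| // | ||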
| // Example usage: | ||
| // | ||
| // if err := sanitize.Init(); err != nil { | ||
| // log.Fatal(err) | ||
| // } | ||
| // cleanContent := sanitize.Sanitize(dirtyContent) | ||
| package sanitize |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,94 @@ | ||
| package sanitize | ||
|
|
||
| import ( | ||
| "fmt" | ||
| "os" | ||
| "regexp" | ||
| "strings" | ||
| "sync" | ||
| ) | ||
|
|
||
// Package-level sanitization state, filled in by Init and read by Sanitize.
var (
	// literalPatterns holds the literal strings to strip, longest first.
	literalPatterns []string
	// regexPatterns holds the compiled expressions to strip, in configured order.
	regexPatterns []*regexp.Regexp
	// initOnce makes the load inside Init run a single time.
	initOnce sync.Once
	// initErr keeps the outcome of that load; Init returns it on every call.
	initErr error
)

// Init loads the sanitization patterns from the environment.
//
// REMOVE_FROM_CONTENT is read as a comma-separated list of literal strings and
// REMOVE_FROM_CONTENT_REGEX as a semicolon-separated list of regular
// expressions. The load runs once; every call returns the result of that
// single load. When an expression does not compile, the returned error names
// the offending pattern and neither pattern list is modified.
func Init() error {
	initOnce.Do(func() {
		var literals []string
		if raw := os.Getenv("REMOVE_FROM_CONTENT"); raw != "" {
			literals = parseCommaSeparated(raw)
			// Order literals longest first (stable for equal lengths). Removing
			// "John" before "John Smith" would destroy the longer match and
			// leave " Smith" behind. The list is a handful of configuration
			// entries, so a small insertion sort is sufficient.
			for i := 1; i < len(literals); i++ {
				for j := i; j > 0 && len(literals[j]) > len(literals[j-1]); j-- {
					literals[j], literals[j-1] = literals[j-1], literals[j]
				}
			}
		}

		var compiled []*regexp.Regexp
		if raw := os.Getenv("REMOVE_FROM_CONTENT_REGEX"); raw != "" {
			// parseSemicolonSeparated already drops empty entries, so every
			// source seen here is non-empty.
			for _, src := range parseSemicolonSeparated(raw) {
				re, err := regexp.Compile(src)
				if err != nil {
					initErr = fmt.Errorf("invalid regex pattern %q: %w", src, err)
					return
				}
				compiled = append(compiled, re)
			}
		}

		// Publish only after everything validated, so a failed load never
		// leaves a half-populated configuration behind.
		literalPatterns = literals
		regexPatterns = compiled
	})
	return initErr
}

// splitTrimmed splits s on sep, trims surrounding whitespace from every piece
// and drops the pieces that end up empty. The returned slice is never nil.
func splitTrimmed(s, sep string) []string {
	pieces := strings.Split(s, sep)
	kept := make([]string, 0, len(pieces))
	for _, piece := range pieces {
		if trimmed := strings.TrimSpace(piece); trimmed != "" {
			kept = append(kept, trimmed)
		}
	}
	return kept
}

// parseCommaSeparated splits s on commas, trims whitespace around each entry
// and discards empty entries.
func parseCommaSeparated(s string) []string {
	return splitTrimmed(s, ",")
}

// parseSemicolonSeparated splits s on semicolons, trims whitespace around each
// entry and discards empty entries.
func parseSemicolonSeparated(s string) []string {
	return splitTrimmed(s, ";")
}
|
|
||
| // Sanitize removes configured patterns from content | ||
| func Sanitize(content string) string { | ||
| if content == "" { | ||
| return content | ||
| } | ||
|
|
||
| if err := Init(); err != nil { | ||
| return "" | ||
| } | ||
|
|
||
| result := content | ||
|
|
||
| // Remove literal patterns | ||
| for _, pattern := range literalPatterns { | ||
| result = strings.ReplaceAll(result, pattern, "") | ||
| } | ||
|
|
||
| // Remove regex patterns | ||
| for _, re := range regexPatterns { | ||
| result = re.ReplaceAllString(result, "") | ||
| } | ||
|
|
||
| return result | ||
| } | ||
|
BieggerM marked this conversation as resolved.
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| package sanitize | ||
|
|
||
| import ( | ||
| "sync" | ||
| "testing" | ||
| ) | ||
|
|
||
| func TestSanitize(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| literals string | ||
| regexes string | ||
| input string | ||
| expected string | ||
| expectInitError bool | ||
| }{ | ||
| { | ||
| name: "no patterns", | ||
| input: "Hello World", | ||
| expected: "Hello World", | ||
| }, | ||
| { | ||
| name: "literal removal", | ||
| literals: "World", | ||
| input: "Hello World", | ||
| expected: "Hello ", | ||
| }, | ||
| { | ||
| name: "multiple literals", | ||
| literals: "foo,bar", | ||
| input: "foobar baz", | ||
| expected: " baz", | ||
| }, | ||
| { | ||
| name: "literals with spaces", | ||
| literals: " foo , bar ", | ||
| input: "foo bar baz", | ||
| expected: " baz", | ||
| }, | ||
| { | ||
| name: "regex iban", | ||
| regexes: `DE\d{20}`, | ||
| input: "IBAN: DE56123341212312312312 end", | ||
| expected: "IBAN: end", | ||
| }, | ||
| { | ||
| name: "regex email", | ||
| regexes: `[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}`, | ||
| input: "Contact: john@example.com or jane@test.org", | ||
| expected: "Contact: or ", | ||
| }, | ||
| { | ||
| name: "literal and regex combined", | ||
| literals: "CONFIDENTIAL", | ||
| regexes: `\b\d{4}-\d{4}-\d{4}-\d{4}\b`, | ||
| input: "CONFIDENTIAL card: 1234-5678-9012-3456", | ||
| expected: " card: ", | ||
| }, | ||
| { | ||
| name: "invalid regex", | ||
| regexes: `[invalid`, | ||
| input: "test", | ||
| expectInitError: true, | ||
| }, | ||
| { | ||
| name: "empty content", | ||
| literals: "test", | ||
| input: "", | ||
| expected: "", | ||
| }, | ||
| { | ||
| name: "regex with semicolon separator", | ||
| regexes: `[A-Z]{2}\d{2}[A-Z0-9]+;test@example\.com`, | ||
| input: "IBAN: DE561233412123123 and email: test@example.com", | ||
| expected: "IBAN: and email: ", | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| // Reset state | ||
| literalPatterns = nil | ||
| regexPatterns = nil | ||
| initOnce = sync.Once{} | ||
| initErr = nil | ||
|
|
||
| // Set env vars | ||
| t.Setenv("REMOVE_FROM_CONTENT", tt.literals) | ||
| t.Setenv("REMOVE_FROM_CONTENT_REGEX", tt.regexes) | ||
|
|
||
| // Initialize | ||
| err := Init() | ||
| if tt.expectInitError { | ||
| if err == nil { | ||
| t.Errorf("expected init error, got nil") | ||
| } | ||
| return | ||
| } | ||
| if err != nil { | ||
| t.Fatalf("unexpected init error: %v", err) | ||
| } | ||
|
|
||
| // Test sanitization | ||
| result := Sanitize(tt.input) | ||
| if result != tt.expected { | ||
| t.Errorf("Sanitize() = %q, want %q", result, tt.expected) | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestParseCommaSeparated(t *testing.T) { | ||
| tests := []struct { | ||
| input string | ||
| expected []string | ||
| }{ | ||
| {"a,b,c", []string{"a", "b", "c"}}, | ||
| {"a, b, c", []string{"a", "b", "c"}}, | ||
| {" a , b ", []string{"a", "b"}}, | ||
| {"a,,b", []string{"a", "b"}}, | ||
| {"a", []string{"a"}}, | ||
| {"", []string{}}, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| result := parseCommaSeparated(tt.input) | ||
| if len(result) != len(tt.expected) { | ||
| t.Errorf("parseCommaSeparated(%q) = %v, want %v", tt.input, result, tt.expected) | ||
| continue | ||
| } | ||
| for i := range result { | ||
| if result[i] != tt.expected[i] { | ||
| t.Errorf("parseCommaSeparated(%q)[%d] = %q, want %q", tt.input, i, result[i], tt.expected[i]) | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| func TestParseSemicolonSeparated(t *testing.T) { | ||
| tests := []struct { | ||
| input string | ||
| expected []string | ||
| }{ | ||
| {`a;b;c`, []string{"a", "b", "c"}}, | ||
| {`a; b; c`, []string{"a", "b", "c"}}, | ||
| {`DE\d{20};[a-z]+`, []string{`DE\d{20}`, `[a-z]+`}}, | ||
| {``, []string{}}, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| result := parseSemicolonSeparated(tt.input) | ||
| if len(result) != len(tt.expected) { | ||
| t.Errorf("parseSemicolonSeparated(%q) = %v, want %v", tt.input, result, tt.expected) | ||
| continue | ||
| } | ||
| for i := range result { | ||
| if result[i] != tt.expected[i] { | ||
| t.Errorf("parseSemicolonSeparated(%q)[%d] = %q, want %q", tt.input, i, result[i], tt.expected[i]) | ||
| } | ||
| } | ||
| } | ||
| } |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.