Skip to content

Commit 066f87c

Browse files
Refactor and introduce a benchmark
1 parent 680c0b4 commit 066f87c

File tree

9 files changed

+480
-40
lines changed

9 files changed

+480
-40
lines changed

benches/README.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# 2MS Benchmarks
2+
3+
## Process Items Benchmark
4+
5+
This benchmark (`BenchmarkProcessItems`) tests the performance of secret detection processing across different configurations.
6+
7+
### What it Tests
8+
9+
1. **Worker Pool Scaling**
10+
- Tests different worker pool sizes based on CPU count
11+
- Ranges from half the CPU count up to 32x CPU count
12+
- Example for 8-core machine: tests 4, 8, 16, 32, 64, 128, and 256 workers
13+
14+
2. **Input Load Testing**
15+
- Tests various input sizes: 50, 100, 500, 1000, and 10000 items
16+
17+
3. **Realistic Content**
18+
- Simulates different file types:
19+
- JavaScript configurations
20+
- Python scripts
21+
- Shell scripts
22+
- YAML configurations
23+
- JSON configurations
24+
- Includes actual secret patterns:
25+
- GitHub Personal Access Tokens
26+
- API keys
27+
- JWTs
28+
- Varies file sizes (1KB, 10KB, 50KB)
29+
- Maintains a 60/40 ratio of files with/without secrets
30+
31+
### Running the Benchmark
32+
33+
```bash
34+
go test -timeout 0 -bench BenchmarkProcessItems -count 5 -run=^$
35+
```
36+
37+
#### Command Flags Explained
38+
- `-timeout 0`: Disables test timeout (needed for long benchmarks)
39+
- `-bench BenchmarkProcessItems`: Runs only this specific benchmark
40+
- `-count 5`: Runs the benchmark 5 times for better statistical significance
41+
- `-run=^$`: Skips regular tests (only runs benchmarks)
42+
43+
#### Note
44+
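The benchmark produces logging output by default because the worker pool logs at Info level. To silence the logs while benchmarking, call `zerolog.SetGlobalLevel(zerolog.Disabled)` before the benchmark loop runs (for example, at the top of `BenchmarkProcessItems`).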

benches/process_items_test.go

Lines changed: 295 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,295 @@
1+
package benches
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"runtime"
7+
"strings"
8+
"sync"
9+
"testing"
10+
11+
"github.com/checkmarx/2ms/v3/engine"
12+
"github.com/checkmarx/2ms/v3/internal/workerpool"
13+
"github.com/checkmarx/2ms/v3/lib/reporting"
14+
"github.com/checkmarx/2ms/v3/lib/secrets"
15+
"github.com/checkmarx/2ms/v3/plugins"
16+
)
17+
18+
// mockItem is an in-memory source item used by the benchmark in place of a
// real plugin item. It only stores the values its getters hand back.
type mockItem struct {
	content *string // returned as-is by GetContent
	id      string  // returned by GetID
	source  string  // returned by GetSource
}

// GetContent returns the pointer to the item's content; it is nil when no
// content was set.
func (i *mockItem) GetContent() *string {
	return i.content
}

// GetID returns the item's identifier.
func (i *mockItem) GetID() string {
	return i.id
}

// GetSource returns the item's source string (the benchmark fills it with a
// mock file path).
func (i *mockItem) GetSource() string {
	return i.source
}
35+
36+
func (i *mockItem) GetGitInfo() *plugins.GitInfo {
37+
return nil
38+
}
39+
40+
// BenchmarkProcessItems benchmarks ProcessItems with realistic content that includes actual secrets
41+
//
42+
// Note: This benchmark will produce logging output because the worker pool logs at Info level.
43+
// To run without log spam, put somewhere zerolog.SetGlobalLevel(zerolog.Disabled)
44+
func BenchmarkProcessItems(b *testing.B) {
45+
nCPU := runtime.GOMAXPROCS(0)
46+
fmt.Println("nCPU", nCPU)
47+
workerSizes := []int{nCPU / 2, nCPU, nCPU * 2, nCPU * 4, nCPU * 8, nCPU * 16, nCPU * 32}
48+
itemSizes := []int{50, 100, 500, 1000, 10000}
49+
50+
// Secret patterns that will trigger detection
51+
secretPatterns := []string{
52+
"github_pat_11ABCDEFG1234567890abcdefghijklmnopqrstuvwxyz123456",
53+
"sk-1234567890abcdefghijklmnopqrstuvwxyz",
54+
"ghp_abcdefghijklmnopqrstuvwxyz1234567890",
55+
"AIzaSyC1234567890abcdefghijklmnopqrstuv",
56+
"xoxb-123456789012-1234567890123-abcdefghijklmnopqrstuvwx",
57+
"eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ.SflKxwRJSMeKKF2QT4fwpMeJf36POk6yJV_adQssw5c",
58+
}
59+
60+
// Content templates simulating different file types
61+
contentTemplates := []string{
62+
// JavaScript config file
63+
`const config = {
64+
apiKey: '%s',
65+
endpoint: 'https://api.example.com',
66+
timeout: 5000,
67+
retries: 3,
68+
debug: process.env.NODE_ENV === 'development'
69+
};
70+
71+
module.exports = config;`,
72+
// Python script
73+
`import requests
74+
import os
75+
76+
API_KEY = '%s'
77+
BASE_URL = 'https://api.service.com/v1'
78+
79+
def make_request(endpoint):
80+
headers = {
81+
'Authorization': f'Bearer {API_KEY}',
82+
'Content-Type': 'application/json'
83+
}
84+
return requests.get(f'{BASE_URL}/{endpoint}', headers=headers)
85+
86+
if __name__ == '__main__':
87+
response = make_request('users')
88+
print(response.json())`,
89+
// Shell script
90+
`#!/bin/bash
91+
92+
# Configuration
93+
export API_TOKEN='%s'
94+
export SERVICE_URL="https://service.example.com"
95+
export ENVIRONMENT="production"
96+
97+
# Function to call API
98+
call_api() {
99+
curl -H "Authorization: Bearer $API_TOKEN" \
100+
-H "Content-Type: application/json" \
101+
"$SERVICE_URL/api/$1"
102+
}
103+
104+
# Main execution
105+
call_api "status"`,
106+
// YAML config
107+
`apiVersion: v1
108+
kind: ConfigMap
109+
metadata:
110+
name: app-config
111+
data:
112+
database_url: postgresql://user:pass@localhost/db
113+
api_key: %s
114+
redis_url: redis://localhost:6379
115+
log_level: info`,
116+
// JSON config
117+
`{
118+
"name": "production-app",
119+
"version": "1.0.0",
120+
"config": {
121+
"api": {
122+
"key": "%s",
123+
"endpoint": "https://api.production.com",
124+
"timeout": 30000
125+
},
126+
"database": {
127+
"host": "db.production.com",
128+
"port": 5432
129+
}
130+
}
131+
}`,
132+
// No secret - regular code
133+
`package utils
134+
135+
import (
136+
"fmt"
137+
"strings"
138+
"time"
139+
)
140+
141+
func ProcessData(input string) (string, error) {
142+
if input == "" {
143+
return "", fmt.Errorf("input cannot be empty")
144+
}
145+
146+
processed := strings.ToUpper(input)
147+
timestamp := time.Now().Format(time.RFC3339)
148+
149+
return fmt.Sprintf("%s - %s", processed, timestamp), nil
150+
}
151+
152+
func ValidateInput(data []byte) bool {
153+
return len(data) > 0 && len(data) < 1048576
154+
}`,
155+
}
156+
157+
for _, workers := range workerSizes {
158+
for _, items := range itemSizes {
159+
b.Run(fmt.Sprintf("realistic_workers_%d_items_%d", workers, items), func(b *testing.B) {
160+
// Pre-create realistic mock items
161+
mockItems := make([]*mockItem, items)
162+
for j := 0; j < items; j++ {
163+
var content string
164+
165+
// 60% of files contain secrets, 40% don't
166+
if j%10 < 6 {
167+
// Select a random template and secret
168+
template := contentTemplates[j%len(contentTemplates)]
169+
secret := secretPatterns[j%len(secretPatterns)]
170+
content = fmt.Sprintf(template, secret)
171+
} else {
172+
// Use non-secret content
173+
content = contentTemplates[len(contentTemplates)-1]
174+
}
175+
176+
// Add some padding to simulate larger files
177+
padding := generateRealisticPadding(j)
178+
content += padding
179+
180+
mockItems[j] = &mockItem{
181+
content: &content,
182+
id: fmt.Sprintf("file_%d", j),
183+
source: fmt.Sprintf("/mock/path/file_%d.js", j),
184+
}
185+
}
186+
187+
b.ResetTimer()
188+
for i := 0; i < b.N; i++ {
189+
// Create engine for each iteration
190+
engineTest, err := engine.Init(&engine.EngineConfig{
191+
DetectorWorkerPoolSize: workers,
192+
})
193+
if err != nil {
194+
b.Fatal(err)
195+
}
196+
197+
// Create fresh channels
198+
itemsChan := make(chan plugins.ISourceItem, items)
199+
secretsChan := make(chan *secrets.Secret, items*2) // Larger buffer for found secrets
200+
report := reporting.Init()
201+
wg := &sync.WaitGroup{}
202+
wg.Add(1)
203+
204+
// Process items
205+
go func() {
206+
defer wg.Done()
207+
processItemsLocal(engineTest, "mockPlugin", itemsChan, secretsChan, report)
208+
engineTest.GetFileWalkerWorkerPool().Wait()
209+
close(secretsChan)
210+
}()
211+
212+
// Send items
213+
for _, item := range mockItems {
214+
itemsChan <- item
215+
}
216+
close(itemsChan)
217+
218+
// Wait for processing
219+
wg.Wait()
220+
221+
// Collect secrets (simulating what the real code does)
222+
secretsFound := 0
223+
for range secretsChan {
224+
secretsFound++
225+
}
226+
227+
// Clean up
228+
_ = engineTest.Shutdown()
229+
}
230+
})
231+
}
232+
}
233+
}
234+
235+
// generateRealisticPadding generates padding content to simulate realistic file sizes.
//
// The seed selects one of three target sizes (1KB, 10KB, 50KB) by seed modulo
// three, so the same seed always yields the same string. Whole pattern lines
// are appended cyclically until the target size is reached, which means the
// result can exceed the target by less than one pattern's length.
func generateRealisticPadding(seed int) string {
	// Size categories: small (1KB), medium (10KB), large (50KB)
	sizes := []int{1024, 10240, 51200}

	// Go's % takes the sign of the dividend; normalise so that a negative
	// seed cannot produce a negative (panicking) index.
	sizeIndex := seed % len(sizes)
	if sizeIndex < 0 {
		sizeIndex += len(sizes)
	}
	targetSize := sizes[sizeIndex]

	// Common code patterns for padding
	patterns := []string{
		"\n\n// Helper functions\n",
		"function helper() { return true; }\n",
		"const data = { id: 1, name: 'test' };\n",
		"if (condition) { console.log('debug'); }\n",
		"// TODO: refactor this later\n",
		"/* eslint-disable no-unused-vars */\n",
		"import { util } from './utils';\n",
		"export default class Component {}\n",
	}

	// The final write may overshoot the target by at most the longest
	// pattern; reserve that up front so the builder never has to regrow.
	longest := 0
	for _, p := range patterns {
		if len(p) > longest {
			longest = len(p)
		}
	}

	var builder strings.Builder
	builder.Grow(targetSize + longest)
	for i := 0; builder.Len() < targetSize; i++ {
		builder.WriteString(patterns[i%len(patterns)])
	}

	return builder.String()
}
267+
268+
// Local version of processItems that doesn't use global variables
269+
func processItemsLocal(eng engine.IEngine, pluginName string, items chan plugins.ISourceItem, secrets chan *secrets.Secret, report *reporting.Report) {
270+
ctx := context.Background()
271+
pool := eng.GetFileWalkerWorkerPool()
272+
273+
for item := range items {
274+
report.TotalItemsScanned++
275+
item := item // capture loop variable
276+
277+
var task workerpool.Task
278+
switch pluginName {
279+
case "filesystem":
280+
task = func(context.Context) error {
281+
return eng.DetectFile(ctx, item, secrets)
282+
}
283+
default:
284+
task = func(context.Context) error {
285+
return eng.DetectFragment(item, secrets, pluginName)
286+
}
287+
}
288+
289+
if err := pool.Submit(task); err != nil {
290+
// Handle error appropriately
291+
break
292+
}
293+
}
294+
pool.CloseQueue()
295+
}

cmd/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ func Execute() (int, error) {
114114
BoolVar(&validateVar, validate, false, "trigger additional validation to check if discovered secrets are valid or invalid")
115115

116116
rootCmd.AddCommand(engine.GetRulesCommand(&engineConfigVar))
117+
if detectorWorkerPoolSize := vConfig.GetInt("2MS_DETECTOR_WORKERPOOL_SIZE"); detectorWorkerPoolSize != 0 {
118+
engineConfigVar.DetectorWorkerPoolSize = detectorWorkerPoolSize
119+
log.Info().Msgf("2MS_DETECTOR_WORKERPOOL_SIZE is set to %d", detectorWorkerPoolSize)
120+
}
117121

118122
group := "Scan Commands"
119123
rootCmd.AddGroup(&cobra.Group{Title: group, ID: group})

0 commit comments

Comments
 (0)