```bash
docker build -t llm-test-bench:latest .
```

```bash
# Set API keys in .env file
echo "OPENAI_API_KEY=your_key_here" > .env
echo "ANTHROPIC_API_KEY=your_key_here" >> .env

# Run benchmark
docker-compose run llm-test-bench bench \
  --dataset /data/datasets/coding-tasks.json \
  --providers openai \
  --metrics faithfulness,relevance \
  --output /data/results
```

Image sizes:

- Builder stage: ~2.5 GB (includes Rust toolchain)
- Runtime image: ~150 MB (Debian slim + binary)
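You can confirm the sizes of your local builds with:

```bash
docker images llm-test-bench
```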
Security features:

- ✅ Non-root user (uid 1000)
- ✅ Minimal runtime dependencies
- ✅ No unnecessary packages
- ✅ CA certificates included
- ✅ Read-only filesystem support (example below)
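Since the image already runs as uid 1000 and supports a read-only root filesystem, a run can be locked down further. A sketch, assuming the tmpfs scratch mount is acceptable for your workload:

```bash
# Read-only root filesystem, explicit non-root user, writable data only via mounts
docker run --rm \
  --read-only \
  --user 1000:1000 \
  --tmpfs /tmp \
  -e OPENAI_API_KEY=$OPENAI_API_KEY \
  -v $(pwd)/datasets:/data/datasets:ro \
  -v $(pwd)/results:/data/results:rw \
  llm-test-bench:latest \
  bench --dataset /data/datasets/coding-tasks.json --output /data/results
```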
Build for multiple platforms:

```bash
docker buildx create --use
docker buildx build \
  --platform linux/amd64,linux/arm64 \
  -t llm-test-bench:latest \
  --push .
```

Run a benchmark:

```bash
docker run --rm \
  -e OPENAI_API_KEY=$OPENAI_API_KEY \
  -v $(pwd)/datasets:/data/datasets:ro \
  -v $(pwd)/results:/data/results:rw \
  llm-test-bench:latest \
  bench \
  --dataset /data/datasets/coding-tasks.json \
  --providers openai \
  --metrics faithfulness,relevance \
  --output /data/results
```

Compare models:

```bash
docker run --rm \
  -e OPENAI_API_KEY=$OPENAI_API_KEY \
  -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY \
  -v $(pwd)/results:/data/results:rw \
  llm-test-bench:latest \
  compare \
  --prompt "Explain quantum entanglement" \
  --models openai:gpt-4,anthropic:claude-3-opus \
  --statistical-tests \
  --output /data/results/comparison.html
```

Generate a dashboard:

```bash
docker run --rm \
  -v $(pwd)/results:/data/results:rw \
  llm-test-bench:latest \
  dashboard \
  --results /data/results/*.json \
  --theme dark \
  --output /data/results/dashboard.html
```

Check for regressions:

```bash
docker run --rm \
  -v $(pwd)/results:/data/results:ro \
  llm-test-bench:latest \
  analyze \
  --baseline /data/results/baseline.json \
  --comparison /data/results/latest.json \
  --fail-on-regression
```

Optimize costs:

```bash
docker run --rm \
  -v $(pwd)/results:/data/results:ro \
  llm-test-bench:latest \
  optimize \
  --current-model gpt-4 \
  --monthly-requests 100000 \
  --quality-threshold 0.80
```
Edit docker-compose.yml to customize:

- API keys (set in .env file)
- Volume mounts
- Resource limits
- Command overrides (see the sketch below)
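For orientation, a minimal sketch of such a file, assuming the service and volume names used elsewhere in this guide (the shipped docker-compose.yml may differ):

```yaml
services:
  llm-test-bench:
    image: llm-test-bench:latest
    env_file: .env                    # API keys live here, never in this file
    volumes:
      - ./datasets:/data/datasets:ro  # datasets mounted read-only
      - ./results:/data/results:rw
      - llm-cache:/data/cache:rw      # evaluation cache (see the caching snippet later)
    deploy:
      resources:
        limits:
          cpus: "2.0"
          memory: 2G
    command: ["bench", "--dataset", "/data/datasets/coding-tasks.json", "--output", "/data/results"]

volumes:
  llm-cache:
```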
```bash
# Build
docker-compose build
# Run benchmark
docker-compose run llm-test-bench bench --dataset /data/datasets/test.json
# View logs
docker-compose logs -f
# Stop services
docker-compose down
# Remove volumes
docker-compose down -v
```

Create Kubernetes secrets for the API keys:

```bash
kubectl create secret generic llm-api-keys \
  --from-literal=openai-key=$OPENAI_API_KEY \
  --from-literal=anthropic-key=$ANTHROPIC_API_KEY
```

Run benchmarks as a Kubernetes Job:

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: llm-benchmark
spec:
  template:
    spec:
      containers:
        - name: llm-test-bench
          image: llm-test-bench:latest
          command:
            - llm-test-bench
            - bench
            - --dataset
            - /data/datasets/coding-tasks.json
            - --providers
            - openai,anthropic
            - --metrics
            - faithfulness,relevance
            - --output
            - /data/results
          env:
            - name: OPENAI_API_KEY
              valueFrom:
                secretKeyRef:
                  name: llm-api-keys
                  key: openai-key
            - name: ANTHROPIC_API_KEY
              valueFrom:
                secretKeyRef:
                  name: llm-api-keys
                  key: anthropic-key
          volumeMounts:
            - name: datasets
              mountPath: /data/datasets
              readOnly: true
            - name: results
              mountPath: /data/results
          resources:
            requests:
              memory: "512Mi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "2000m"
      volumes:
        - name: datasets
          configMap:
            name: llm-datasets
        - name: results
          persistentVolumeClaim:
            claimName: llm-results
      restartPolicy: Never
  backoffLimit: 3
```

Schedule recurring benchmarks with a CronJob:

```yaml
apiVersion: batch/v1
kind: CronJob
metadata:
  name: daily-benchmark
spec:
  schedule: "0 0 * * *"  # Daily at midnight
  jobTemplate:
    spec:
      template:
        spec:
          # Same as Job above
```
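To verify the CronJob without waiting for the schedule, you can trigger it once by hand (the job name here is arbitrary):

```bash
# Create a one-off Job from the CronJob spec and follow its logs
kubectl create job --from=cronjob/daily-benchmark daily-benchmark-manual
kubectl logs -f job/daily-benchmark-manual
```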
See .github/workflows/llm-benchmark.yml for the complete example. Key features (a minimal sketch follows this list):
- Daily scheduled benchmarks
- Regression detection
- Artifact upload
- PR comments with results
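As an illustration only, not the shipped workflow, a sketch of that shape; the registry image, dataset path, and baseline/latest file names are assumptions:

```yaml
name: llm-benchmark

on:
  schedule:
    - cron: "0 0 * * *"  # daily, mirroring the CronJob above

jobs:
  benchmark:
    runs-on: ubuntu-latest
    container:
      image: ghcr.io/your-org/llm-test-bench:latest  # assumed registry location
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - name: Run benchmark
        run: |
          llm-test-bench bench \
            --dataset datasets/coding-tasks.json \
            --providers openai \
            --metrics faithfulness,relevance \
            --output results
      - name: Fail on regression  # baseline/latest file names assumed
        run: |
          llm-test-bench analyze \
            --baseline results/baseline.json \
            --comparison results/latest.json \
            --fail-on-regression
      - name: Upload results
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: results/
```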
For GitLab CI:

```yaml
benchmark:
  stage: test
  image: llm-test-bench:latest
  script:
    - llm-test-bench bench
      --dataset datasets/coding-tasks.json
      --providers openai
      --metrics faithfulness,relevance
      --output results
  artifacts:
    paths:
      - results/
    expire_in: 90 days
  only:
    - schedules
```

Recommended limits:
- CPU: 0.5-2.0 cores
- Memory: 512MB-2GB
- Disk: 1GB for cache (example run below)
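Translated into docker run flags, with values picked from the upper end of these ranges (adjust to your workload):

```bash
docker run --rm \
  --cpus=2 \
  --memory=2g \
  -e OPENAI_API_KEY=$OPENAI_API_KEY \
  -v $(pwd)/datasets:/data/datasets:ro \
  -v $(pwd)/results:/data/results:rw \
  llm-test-bench:latest \
  bench --dataset /data/datasets/coding-tasks.json --output /data/results
```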
Enable evaluation caching to reduce costs:

```yaml
environment:
  - LLM_TEST_BENCH_EVALUATION__CACHE_ENABLED=true
volumes:
  - llm-cache:/data/cache:rw
```

Caching can reduce API costs by 80% or more on repeated runs.
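If `llm-cache` is a named volume, it must also be declared at the top level of docker-compose.yml, or Compose will reject the configuration:

```yaml
# Top-level declaration; Docker manages the volume's lifecycle
volumes:
  llm-cache:
```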
Control parallel execution:

```yaml
environment:
  - LLM_TEST_BENCH_ORCHESTRATION__MAX_PARALLEL_MODELS=5
```

If builds fail, clear the build cache:

```bash
# Clear build cache
docker builder prune -a

# Build with no cache
docker build --no-cache -t llm-test-bench:latest .
```

If the container cannot write results, fix volume permissions:

```bash
# Ensure volumes have correct permissions
chmod -R 755 results/
chown -R 1000:1000 results/
```

If API calls fail, verify that keys reach the container:

```bash
# Verify keys are set
docker run --rm -e OPENAI_API_KEY=$OPENAI_API_KEY \
  llm-test-bench:latest env | grep API_KEY

# Test with a simple command
docker run --rm \
  -e OPENAI_API_KEY=$OPENAI_API_KEY \
  llm-test-bench:latest \
  config show
```

If runs are killed for lack of memory, raise the limit:

```bash
# Increase memory limit
docker run --memory=2g llm-test-bench:latest ...
```

Or in docker-compose.yml:

```yaml
deploy:
  resources:
    limits:
      memory: 2G
```

Security best practices:

- Never commit API keys - Use environment variables or secrets
- Use non-root user - Already configured in Dockerfile
- Read-only volumes - Mount datasets as read-only
- Resource limits - Set CPU and memory limits
- Network isolation - Use Docker networks (example below)
- Image scanning - Scan for vulnerabilities regularly
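For the network-isolation item above, one approach is a dedicated bridge network with no other containers attached (the network name is illustrative):

```bash
docker network create llm-bench-net

docker run --rm --network llm-bench-net \
  -e OPENAI_API_KEY=$OPENAI_API_KEY \
  -v $(pwd)/datasets:/data/datasets:ro \
  -v $(pwd)/results:/data/results:rw \
  llm-test-bench:latest \
  bench --dataset /data/datasets/coding-tasks.json --output /data/results
```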
```bash
# Scan with Trivy
trivy image llm-test-bench:latest
```

Add to docker-compose.yml:

```yaml
healthcheck:
  test: ["CMD", "llm-test-bench", "--version"]
  interval: 30s
  timeout: 10s
  retries: 3
  start_period: 40s
```
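Once the health check is active, its status can be queried (assuming the container is named llm-test-bench):

```bash
# Prints starting, healthy, or unhealthy
docker inspect --format '{{.State.Health.Status}}' llm-test-bench
```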
```bash
# View logs
docker logs llm-test-bench
# Follow logs
docker logs -f llm-test-bench
# Export logs
docker logs llm-test-bench > llm-bench.log 2>&1
```

Production deployment checklist:

- API keys stored in secure secrets manager
- Resource limits configured
- Health checks enabled
- Logging configured
- Monitoring alerts set up
- Backup strategy for results
- Cache persistence configured
- Image vulnerability scan passed
- Network security configured
- Documentation updated
For issues or questions:
- GitHub Issues: https://github.com/your-org/llm-test-bench/issues
- Documentation: https://docs.llm-test-bench.io