This document provides complete documentation for all configuration options available in the Kubernetes Inference Performance Benchmark tool.
Controls the API interaction behavior:
api:
type: completion # API type (completion|chat) (default: completion), completion is the default since the chat API is not typically enabled on model servers such as vLLM by default without additional configuration.
streaming: false # Enable/disable streaming (default: false), needs to be enabled for metrics like TTFT, ITL and TPOT to be measured
headers: # Add custom http headers to the request sent to the inference server
x-inference-model: llama
x-routing-strategy: round-robin

Configures the test data generation methodology:
data:
type: mock|shareGPT|synthetic|random|shared_prefix|cnn_dailymail|billsum_conversations|infinity_instruct # Data generation type
path: ./data/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json # For shareGPT type, path where dataset to be used is present. Path needs to be set for cnn_dailymail, billsum_conversations and infinity_instruct as well
input_distribution: # For synthetic/random types
min: 10 # Minimum prompt length (tokens)
max: 100 # Maximum prompt length
mean: 50 # Average length
std_dev: 10 # Standard deviation
total_count: 100 # Total prompts to generate
output_distribution: # Same structure as input_distribution
min: 10
max: 100
mean: 50
std_dev: 10
total_count: 100
shared_prefix: # For shared_prefix type
num_groups: 10 # Number of shared prefix groups
num_prompts_per_group: 10 # Unique questions per group
system_prompt_len: 100 # Shared prefix length (tokens)
question_len: 50 # Question length (tokens)
output_len: 50 # Target output length (tokens)

Defines the benchmarking load pattern:
load:
type: constant|poisson|concurrent # Load pattern type
interval: 1.0 # Seconds between request batches
stages: # Load progression stages
- rate: 1 # Requests per second (CONSTANT or POISSON LOADS)
duration: 30 # Seconds to maintain this rate (CONSTANT or POISSON LOADS)
concurrency_level: 3 # Level of concurrency/number of worker threads (CONCURRENT LOADS)
num_requests: 40 # Number of requests to be processed by concurrency_level worker threads (CONCURRENT LOADS)
num_workers: 4 # Concurrent worker threads (default: CPU_cores)
worker_max_concurrency: 10 # Max concurrent requests per worker
worker_max_tcp_connections: 2500 # Max TCP connections per worker
lora_traffic_split: # Optional: MultiLoRA traffic splitting
- name: adapter_1 # LoRA adapter name
split: 0.5 # Traffic weight (must sum to 1.0)
- name: adapter_2
split: 0.5

Defines the preprocessing phase to determine load based on target service saturation.
load:
type: constant|poisson
interval: 15
sweep: # Automatically determine saturation point of the target service and generate stages
type: linear|geometric # Produce a linear distribution [1.0, saturation] of rates for num_stages or geometric distribution clustered around the saturation point
timeout: 60 # Length of time to run load to determine saturation
num_stages: 5 # Number of stages to generate
stage_duration: 180 # Duration of each generated stage
saturation_percentile: 95 # Percentile of sampled rates to select as saturation point

Configures connection to the model serving backend:
server:
type: vllm # Currently only vLLM supported
model_name: "HuggingFaceTB/SmolLM2-135M-Instruct" # Required model identifier
base_url: "http://0.0.0.0:8000" # Required server endpoint
ignore_eos: true # Whether to ignore End-of-Sequence tokens
api_key: "" # Optional API key for authenticated endpoints

Sets up performance metrics collection:
metrics:
type: prometheus|default # Metrics backend type
prometheus: # Required when type=prometheus
url: "http://localhost:9090" # Prometheus server URL
scrape_interval: 15 # Metrics scrape interval (seconds)
google_managed: false # Whether using Google Managed Prometheus (see 'Google Managed Prometheus (GMP) Requirements' section)
filters: [] # List of metric names to collect

When setting google_managed: true, inference-perf queries the GMP API directly. You must configure Application Default Credentials (ADC) in your environment with sufficient permissions.
-
Required Permissions: The identity used by ADC must have the Monitoring Viewer role:
roles/monitoring.viewer
-
Environment Configuration
- GKE Cluster: Ensure the Pod is running with Workload Identity enabled and linked to a Google Service Account (GSA) with the required role.
- GCE VM: Ensure the VM's attached Service Account has the required role.
- Local Development: Authenticate using your user credentials:
gcloud auth application-default login
Note: Your personal user account must have the
monitoring.viewer role on the target GCP project.
Common Error: Failing to configure these permissions will result in API errors similar to:
ERROR - error executing query: 403 Client Error: Forbidden for url: https://monitoring.googleapis.com/v1/projects/...
Controls benchmark report generation:
report:
request_lifecycle:
summary: true # Generate high-level summary
per_stage: true # Include breakdown by load stage
per_request: false # Enable detailed per-request logs (verbose)
per_adapter: false # Generate metrics grouped by LoRA adapter
per_adapter_stage: false # Generate metrics grouped by adapter and stage
percentiles: [0.1, 1, 5, 10, 25, 50, 75, 90, 95, 99, 99.9] # List of percentiles to calculate
prometheus:
summary: true # Include Prometheus metrics summary
per_stage: false # Disable Prometheus stage breakdown

Configures storage for benchmark results:
storage:
local_storage:
path: "reports-{timestamp}" # Local directory path
report_file_prefix: null # Optional filename prefix
google_cloud_storage: # Optional GCS configuration
bucket_name: "your-bucket-name" # Required GCS bucket
path: "reports-{timestamp}" # Optional path prefix
report_file_prefix: null # Optional filename prefix
simple_storage_service:
bucket_name: "your-bucket-name" # Required S3 bucket
path: "reports-{timestamp}" # Optional path prefix
report_file_prefix: null # Optional filename prefix

Optional tokenizer configuration for specialized tokenization:
tokenizer:
pretrained_model_name_or_path: "model-id" # Required model path
trust_remote_code: true # Whether to trust custom tokenizer code
token: "" # HuggingFace access token for private models

data:
type: shareGPT
load:
type: constant
stages:
- rate: 1
duration: 30
api:
type: chat
server:
type: vllm
model_name: HuggingFaceTB/SmolLM2-135M-Instruct
base_url: http://0.0.0.0:8000

load:
type: constant
stages:
- rate: 1
duration: 30
api:
type: completion
server:
type: vllm
model_name: HuggingFaceTB/SmolLM2-135M-Instruct
base_url: http://0.0.0.0:8000
ignore_eos: true
tokenizer:
pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
data:
type: random
input_distribution:
min: 10 # min length of the synthetic prompts
max: 100 # max length of the synthetic prompts
mean: 50 # mean length of the synthetic prompts
std_dev: 10 # standard deviation of the length of the synthetic prompts
total_count: 100 # total number of prompts to generate to fit the above mentioned distribution constraints
output_distribution:
min: 10 # min length of the output to be generated
max: 100 # max length of the output to be generated
mean: 50 # mean length of the output to be generated
std_dev: 10 # standard deviation of the length of the output to be generated
total_count: 100 # total number of output lengths to generate to fit the above mentioned distribution constraints
metrics:
type: prometheus
prometheus:
url: http://localhost:9090
scrape_interval: 15
report:
request_lifecycle:
summary: true
per_stage: true
per_request: true
prometheus:
summary: true
per_stage: true

load:
type: constant
stages:
- rate: 1
duration: 30
api:
type: chat
server:
type: vllm
model_name: ./models/SmolLM2-135M-Instruct
base_url: http://0.0.0.0:8000
ignore_eos: true
tokenizer:
pretrained_model_name_or_path: ./models/SmolLM2-135M-Instruct
data:
type: shareGPT
path: ./data/shareGPT/ShareGPT_V3_unfiltered_cleaned_split.json # path to the downloaded shareGPT dataset
metrics:
type: prometheus
prometheus:
url: http://localhost:9090
scrape_interval: 15
report:
request_lifecycle:
summary: true
per_stage: true
per_request: false
prometheus:
summary: true
per_stage: true