neuralmagic · sallyom · May 11, 2025 · sjmonson · Jun 17, 2025 · sjmonson
diff --git a/.gitignore b/.gitignore
@@ -178,3 +178,6 @@ cython_debug/
 # Project specific files
 *.json
 *.yaml
+
+# Allow specific YAML files
+!docs/guides/k8s/*.yaml
diff --git a/docs/guides/example-analysis/README.md b/docs/guides/example-analysis/README.md
@@ -0,0 +1,44 @@
+# GuideLLM Example Analysis
+
+This directory contains an example analysis script for GuideLLM performance testing.
+
+## Running Benchmarks in Kubernetes
+
+To run comprehensive GuideLLM benchmarks in Kubernetes, follow the instructions in the [k8s/README.md](../k8s/README.md). This will help you:
+
+- Set up the necessary Kubernetes environment
+- Configure benchmark parameters
+- Execute the benchmarks
+- Collect performance data
+
+## Analyzing Results
+
+### Using the Analysis Script
+
+The [analyze_benchmarks.py](./analyze_benchmarks.py) script processes benchmark YAML output and generates visualizations and statistics. To use it:
+
+1. Install required dependencies:
+
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+2. Ensure the GuideLLM benchmark YAML file from the Kubernetes guidellm-job pod is copied to your local environment.
+
+   ```bash
+   # From the k8s/README.md instructions
+   kubectl cp <pod-name>:/path/to/benchmark.yaml ./llama32-3b.yaml
+   ```
+
+3. Run the analysis script from the directory containing the YAML file (the script reads `llama32-3b.yaml` from the current working directory):
+
+   ```bash
+   python analyze_benchmarks.py
+   ```
+
+The script will:
+
+- Process the benchmark YAML file
+- Generate visualizations in the `benchmark_plots` directory
+- Create a CSV file with processed metrics
+- Print summary statistics
diff --git a/docs/guides/example-analysis/analyze_benchmarks.py b/docs/guides/example-analysis/analyze_benchmarks.py
@@ -0,0 +1,140 @@
+import yaml
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from pathlib import Path
+
+def process_benchmark_yaml(yaml_file):
+    """Process the benchmark YAML file and return a DataFrame with the data."""
+    with open(yaml_file, 'r') as f:
+        data = yaml.safe_load(f)
+
+    # Extract concurrency levels from the benchmark configuration
+    concurrency_levels = data['benchmarks'][0]['args']['profile']['measured_concurrencies']
+
+    # Process metrics for each concurrency level
+    processed_data = []
+    for i, benchmark in enumerate(data['benchmarks']):
+        if 'metrics' in benchmark:
+            metrics = benchmark['metrics']
+            concurrency = concurrency_levels[i] if i < len(concurrency_levels) else 1.0
+
+            # Extract successful metrics
+            for metric_name, metric_data in metrics.items():
+                if 'successful' in metric_data:
+                    successful = metric_data['successful']
+                    processed_data.append({
+                        'concurrency': concurrency,
+                        'metric': metric_name,
+                        'count': successful.get('count', 0),
+                        'mean': successful.get('mean', 0),
+                        'median': successful.get('median', 0),
+                        'min': successful.get('min', 0),
+                        'max': successful.get('max', 0),
+                        'std_dev': successful.get('std_dev', 0),
+                        'p95': successful.get('percentiles', {}).get('p95', 0),
+                        'p99': successful.get('percentiles', {}).get('p99', 0)
+                    })
+
+    # Convert to DataFrame
+    df = pd.DataFrame(processed_data)
+    return df
+
+def create_visualizations(df):
+    """Create visualizations for the benchmark data."""
+    # Create plots directory if it doesn't exist
+    plot_dir = Path('benchmark_plots')
+    plot_dir.mkdir(exist_ok=True)
+
+    # Set style
+    plt.style.use('default')
+
+    # Sort by concurrency for better visualization
+    df = df.sort_values('concurrency')
+
+    # Create visualizations for each metric
+    metrics_to_plot = [
+        'request_latency',
+        'time_to_first_token_ms',
+        'tokens_per_second',
+        'inter_token_latency_ms'
+    ]
+
+    for metric in metrics_to_plot:
+        metric_df = df[df['metric'] == metric]
+        if not metric_df.empty:
+            # Mean vs Median
+            plt.figure(figsize=(12, 6))
+            plt.plot(metric_df['concurrency'], metric_df['mean'], 'b-', label='Mean')
+            plt.plot(metric_df['concurrency'], metric_df['median'], 'r--', label='Median')
+            plt.title(f'{metric.replace("_", " ").title()} vs Concurrency')
+            plt.xlabel('Concurrency Level')
+            plt.ylabel('Value')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(plot_dir / f'{metric}_mean_median.png')
+            plt.close()
+
+            # Min-Max Range
+            plt.figure(figsize=(12, 6))
+            plt.fill_between(metric_df['concurrency'], 
+                           metric_df['min'], 
+                           metric_df['max'], 
+                           alpha=0.3, 
+                           label='Min-Max Range')
+            plt.plot(metric_df['concurrency'], metric_df['mean'], 'b-', label='Mean')
+            plt.title(f'{metric.replace("_", " ").title()} Range vs Concurrency')
+            plt.xlabel('Concurrency Level')
+            plt.ylabel('Value')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(plot_dir / f'{metric}_range.png')
+            plt.close()
+
+            # Percentiles
+            plt.figure(figsize=(12, 6))
+            plt.plot(metric_df['concurrency'], metric_df['p95'], 'g--', label='95th Percentile')
+            plt.plot(metric_df['concurrency'], metric_df['p99'], 'r--', label='99th Percentile')
+            plt.plot(metric_df['concurrency'], metric_df['mean'], 'b-', label='Mean')
+            plt.title(f'{metric.replace("_", " ").title()} Percentiles vs Concurrency')
+            plt.xlabel('Concurrency Level')
+            plt.ylabel('Value')
+            plt.legend()
+            plt.grid(True)
+            plt.tight_layout()
+            plt.savefig(plot_dir / f'{metric}_percentiles.png')
+            plt.close()
+
+def main():
+    # Process the YAML file
+    df = process_benchmark_yaml('llama32-3b.yaml')
+
+    # Create visualizations
+    create_visualizations(df)
+
+    # Print summary statistics by concurrency level
+    print("\nSummary Statistics by Concurrency Level:")
+    for concurrency in sorted(df['concurrency'].unique()):
+        print(f"\nConcurrency Level: {concurrency:.2f}")
+        subset = df[df['concurrency'] == concurrency]
+
+        for metric in subset['metric'].unique():
+            metric_data = subset[subset['metric'] == metric]
+            print(f"\n{metric.replace('_', ' ').title()}:")
+            print(f"Count: {metric_data['count'].iloc[0]}")
+            print(f"Mean: {metric_data['mean'].iloc[0]:.2f}")
+            print(f"Median: {metric_data['median'].iloc[0]:.2f}")
+            print(f"Min: {metric_data['min'].iloc[0]:.2f}")
+            print(f"Max: {metric_data['max'].iloc[0]:.2f}")
+            print(f"Std Dev: {metric_data['std_dev'].iloc[0]:.2f}")
+            print(f"95th Percentile: {metric_data['p95'].iloc[0]:.2f}")
+            print(f"99th Percentile: {metric_data['p99'].iloc[0]:.2f}")
+
+    # Save processed data
+    df.to_csv('benchmark_processed_data.csv', index=False)
+    print("\nProcessed data saved to benchmark_processed_data.csv")
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/guides/example-analysis/requirements.txt b/docs/guides/example-analysis/requirements.txt
@@ -0,0 +1,4 @@
+pyyaml>=6.0
+pandas>=2.0.0
+matplotlib>=3.7.0
+seaborn>=0.12.0 
diff --git a/docs/guides/k8s/Dockerfile b/docs/guides/k8s/Dockerfile
@@ -0,0 +1,12 @@
+FROM registry.access.redhat.com/ubi9/python-312:9.5-1744198409
+
+RUN pip install --upgrade pip && \
+    pip install git+https://github.com/neuralmagic/guidellm.git@main
+
+# Default values; override these env vars in guidellm-job.yaml
+ENV TARGET=http://localhost:8000/v1 \
+    MODEL=neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 \
+    DATA_TYPE=emulated \
+    DATA=prompt_tokens=512,generated_tokens=128
+
+ENTRYPOINT ["guidellm"]
diff --git a/docs/guides/k8s/README.md b/docs/guides/k8s/README.md
@@ -0,0 +1,53 @@
+## Run Guidellm with Kubernetes Job
-## Run Guidellm with Kubernetes Job
+## Run GuideLLM with Kubernetes Job
-## Run Guidellm with Kubernetes Job
+## Run GuideLLM with Kubernetes Job
+
+Here's an example to run `guidellm` with `meta-llama/Llama-3.2-3B-Instruct` that has been deployed with
+[llm-d-deployer](https://github.com/neuralmagic/llm-d-deployer/blob/main/quickstart/README-minikube.md).
+Replace the `--target` and references to `Llama-3.2-3B` in [guidellm-job.yaml](./guidellm-job.yaml) to evaluate any served LLM.
+
+### Run evaluation
+
+```bash
+# Update the claimName in accessor-pod.yaml and guidellm-job.yaml if using a different PVC name
+kubectl apply -f pvc.yaml
+kubectl apply -f guidellm-job.yaml
+```
+
+> **📝 NOTE:** [Dockerfile](./Dockerfile) was used to build the image for the guidellm-job pod.
+
-> **📝 NOTE:** [Dockerfile](./Dockerfile) was used to build the image for the guidellm-job pod.
-> **📝 NOTE:** [Dockerfile](./Dockerfile) was used to build the image for the guidellm-job pod.
+> **📝 NOTE:** The HF_TOKEN is passed to the job, but this will not be necessary if you use the same PVC as the one storing your model.
+> GuideLLM uses the model's tokenizer/processor files in its evaluation. You can pass a path instead with `--tokenizer=/path/to/model`.
+> This eliminates the need for Guidellm to download the files from Huggingface.
-> This eliminates the need for Guidellm to download the files from Huggingface.
+> This eliminates the need for GuideLLM to download the files from Hugging Face.
-> This eliminates the need for Guidellm to download the files from Huggingface.
+> This eliminates the need for GuideLLM to download the files from Hugging Face.
+
+The job logs show formatted tables that summarize the results. A large YAML report file is also created. The evaluation for this model
+takes roughly 20-30 minutes.
+
+### Extract GuideLLM Report
+
+```bash
+kubectl apply -f accessor-pod.yaml
+
+# Wait for the pod to be ready
+kubectl wait --for=condition=Ready pod/guidellm-accessor
+
+# Copy the report file from the pod (accessor pod mounts the volume as read-only)
+kubectl cp guidellm-accessor:/app/data/guidellm-reports.tgz ./guidellm-reports.tgz
+```
+
+Extract the report:
+
+```bash
+tar -xvf guidellm-reports.tgz
+```
+
+You will now have a local file `./llama32-3b.yaml` (the archive is extracted into the current directory).
+
+You can remove the accessor pod with:
+
+```bash
+kubectl delete pod guidellm-accessor
+```
+
+### Gather Insights from GuideLLM Report
+
+You can follow the ["Analyzing Results" section](../example-analysis/README.md#analyzing-results) to gain insights from your LLM
+deployments using the GuideLLM report.
diff --git a/docs/guides/k8s/accessor-pod.yaml b/docs/guides/k8s/accessor-pod.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: guidellm-accessor
+spec:
+  containers:
+  - command:
+    - sleep
+    - "3600"
+    image: registry.access.redhat.com/ubi9/ubi
+    name: accessor
+    volumeMounts:
+    - mountPath: /app/data
+      name: output
+      readOnly: true
+  volumes:
+  - name: output
+    persistentVolumeClaim:
+      claimName: guidellm-output-pvc
+      readOnly: true
diff --git a/docs/guides/k8s/guidellm-job.yaml b/docs/guides/k8s/guidellm-job.yaml
@@ -0,0 +1,84 @@
+# This job takes ~25min to complete.
+# This will create a very large yaml file. To extract the file, run:
+# kubectl apply -f accessor-pod.yaml
+# mkdir ./guidellm-reports
+# kubectl cp guidellm-accessor:/app/data/guidellm-reports.tgz ./guidellm-reports/guidellm-reports.tgz
+# You will now have a local ./guidellm-reports/guidellm-reports.tgz. To extract it, run:
+# tar -xvf ./guidellm-reports/guidellm-reports.tgz -C ./guidellm-reports
+# You will now have a local file ./guidellm-reports/llama32-3b.yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: run-guidellm
+spec:
+  template:
+    spec:
+      containers:
+      - name: guidellm
+        # TODO: replace this image
+        image: quay.io/sallyom/guidellm:latest
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+              - ALL
+          runAsNonRoot: true
+          seccompProfile:
+            type: RuntimeDefault
+        args:
+        - benchmark
+        - --target=$(TARGET)
+        - --data=$(DATA)
+        - --rate-type=sweep
+        - --model=$(MODEL)
+        - --output-path=/app/data/llama32-3b.yaml
+        env:
+        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
+        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
+        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Huggingface.
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              key: HF_TOKEN
+              name: huggingface-secret
+        - name: TARGET
+          value: "http://llm-d-inference-gateway.llm-d.svc.cluster.local:80/v1"
+        - name: DATA_TYPE
+          value: "emulated"
+        - name: DATA
+          value: "prompt_tokens=512,output_tokens=128"
+        - name: MODEL
+          value: "meta-llama/Llama-3.2-3B-Instruct"
+        volumeMounts:
+        - name: output
+          mountPath: /app/data
-      - name: guidellm
-        # TODO: replace this image
-        image: quay.io/sallyom/guidellm:latest
-        imagePullPolicy: IfNotPresent
-        securityContext:
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-              - ALL
-          runAsNonRoot: true
-          seccompProfile:
-            type: RuntimeDefault
-        args:
-        - benchmark
-        - --target=$(TARGET)
-        - --data=$(DATA)
-        - --rate-type=sweep
-        - --model=$(MODEL)
-        - --output-path=/app/data/llama32-3b.yaml
-        env:
-        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
-        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
-        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Huggingface.
-        - name: HF_TOKEN
-          valueFrom:
-            secretKeyRef:
-              key: HF_TOKEN
-              name: huggingface-secret
-        - name: TARGET
-          value: "http://llm-d-inference-gateway.llm-d.svc.cluster.local:80/v1"
-        - name: DATA_TYPE
-          value: "emulated"
-        - name: DATA
-          value: "prompt_tokens=512,output_tokens=128"
-        - name: MODEL
-          value: "meta-llama/Llama-3.2-3B-Instruct"
-        volumeMounts:
-        - name: output
-          mountPath: /app/data
+      - name: guidellm
+        image: ghcr.io/neuralmagic/guidellm:latest
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+              - ALL
+          runAsNonRoot: true
+          seccompProfile:
+            type: RuntimeDefault
+        env:
+        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
+        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
+        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Huggingface.
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              key: HF_TOKEN
+              name: huggingface-secret
+        - name: GUIDELLM_TARGET
+          value: "http://llm-d-inference-gateway.llm-d.svc.cluster.local:80"
+        - name: GUIDELLM_RATE_TYPE
+          value: "sweep"
+        - name: GUIDELLM_DATA
+          value: "prompt_tokens=512,output_tokens=128"
+        - name: GUIDELLM_MODEL
+          value: "meta-llama/Llama-3.2-3B-Instruct"
+        volumeMounts:
+        - name: output
+          mountPath: /app/data
-      - name: guidellm
-        # TODO: replace this image
-        image: quay.io/sallyom/guidellm:latest
-        imagePullPolicy: IfNotPresent
-        securityContext:
-          allowPrivilegeEscalation: false
-          capabilities:
-            drop:
-              - ALL
-          runAsNonRoot: true
-          seccompProfile:
-            type: RuntimeDefault
-        args:
-        - benchmark
-        - --target=$(TARGET)
-        - --data=$(DATA)
-        - --rate-type=sweep
-        - --model=$(MODEL)
-        - --output-path=/app/data/llama32-3b.yaml
-        env:
-        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
-        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
-        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Huggingface.
-        - name: HF_TOKEN
-          valueFrom:
-            secretKeyRef:
-              key: HF_TOKEN
-              name: huggingface-secret
-        - name: TARGET
-          value: "http://llm-d-inference-gateway.llm-d.svc.cluster.local:80/v1"
-        - name: DATA_TYPE
-          value: "emulated"
-        - name: DATA
-          value: "prompt_tokens=512,output_tokens=128"
-        - name: MODEL
-          value: "meta-llama/Llama-3.2-3B-Instruct"
-        volumeMounts:
-        - name: output
-          mountPath: /app/data
+      - name: guidellm
+        image: ghcr.io/neuralmagic/guidellm:latest
+        imagePullPolicy: IfNotPresent
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+              - ALL
+          runAsNonRoot: true
+          seccompProfile:
+            type: RuntimeDefault
+        env:
+        # HF_TOKEN is not necessary if you share/use the model PVC. Guidellm needs to access the tokenizer file.
+        # You can provide a path to the tokenizer file by passing `--tokenizer=/path/to/model`. If you do not
+        # pass the tokenizer path, Guidellm will get the tokenizer file(s) from Huggingface.
+        - name: HF_TOKEN
+          valueFrom:
+            secretKeyRef:
+              key: HF_TOKEN
+              name: huggingface-secret
+        - name: GUIDELLM_TARGET
+          value: "http://llm-d-inference-gateway.llm-d.svc.cluster.local:80"
+        - name: GUIDELLM_RATE_TYPE
+          value: "sweep"
+        - name: GUIDELLM_DATA
+          value: "prompt_tokens=512,output_tokens=128"
+        - name: GUIDELLM_MODEL
+          value: "meta-llama/Llama-3.2-3B-Instruct"
+        volumeMounts:
+        - name: output
+          mountPath: /app/data
+      - name: extract
+        image: registry.access.redhat.com/ubi9/ubi
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop:
+              - ALL
+          runAsNonRoot: true
+          seccompProfile:
+            type: RuntimeDefault
+        command: ["sh", "-c"]
+        args:
+        - |
+          echo "Waiting for guidellm container to complete...";
+          while [ ! -f /app/data/llama32-3b.yaml ]; do
+            sleep 60;
+          done;
+          echo "Guidellm completed, packing reports...";
+          cd /app/data && \
+          tar czf guidellm-reports.tgz *.yaml && \
+          rm /app/data/llama32-3b.yaml
+        volumeMounts:
+        - name: output
+          mountPath: /app/data
+      restartPolicy: Never
+      volumes:
+      - name: output
+        persistentVolumeClaim:
+          claimName: guidellm-output-pvc
diff --git a/docs/guides/k8s/pvc.yaml b/docs/guides/k8s/pvc.yaml
@@ -0,0 +1,11 @@
+# Example PVC - update to match your cluster
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: guidellm-output-pvc
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 2Gi