Add GPU profiler #1997


Draft pull request: wants to merge 31 commits into base: main.

Commits (31)
967ea76
Add profiler and Perfetto UI link with comprehensive tests (#1984, #1…
jainapurva Apr 1, 2025
dd9f50d
More models
jainapurva Apr 4, 2025
328b7bf
Update
jainapurva Apr 4, 2025
acc3c79
Reintroduce has_weight_zeros as a template param
metascroy Apr 1, 2025
77c4ef1
Clean up op interface
metascroy Apr 1, 2025
7959ac3
quantized matmul
kimishpatel Apr 2, 2025
0304a52
Allow builds on less than sm75 raise runtime failure (#1999)
drisspg Apr 2, 2025
3f89080
Skip galore test if not cuda (#2003)
jerryzh168 Apr 2, 2025
4e8f7f8
Fix experimental CI (#2005)
metascroy Apr 2, 2025
97f6618
Add fp32xint8 matmul
kimishpatel Apr 2, 2025
8f9bd0a
Add quantized q @ k test for intended use in quantized attention
kimishpatel Apr 2, 2025
2f62e01
Update version.txt (#2009)
jerryzh168 Apr 2, 2025
49705d9
Initial prototype of differentiable _scaled_grouped_mm function (#1969)
danielvegamyhre Apr 2, 2025
71a3d96
Add quantized attn_scores @ v test for intended use in quantized att…
kimishpatel Apr 3, 2025
50d133a
add fallback kernel and interface
kimishpatel Apr 3, 2025
d741ff0
Add fallback kernel and interface for rhs only quantized matmul
kimishpatel Apr 3, 2025
e190329
Add KleidiAI gemm kernels (#2000)
metascroy Apr 3, 2025
5409515
Update float8nocompile test code to use new float8 matmul function (#…
danielvegamyhre Apr 4, 2025
916f9d7
Remove float8nocompile CI (#1976)
danielvegamyhre Apr 4, 2025
0436d35
Update clean_release_notes.py (#2014)
jerryzh168 Apr 4, 2025
8ae4b6a
Match QAT prepare and convert numerics exactly (#1964)
andrewor14 Apr 4, 2025
90bff95
Skip failing tests for rowwise-scaled (#2022)
drisspg Apr 4, 2025
711d584
Update torchao.prototype.parq and add 4-bit Llama 3.2 1B benchmark (#…
lisjin Apr 4, 2025
ee2b9c7
Use quantized gemm only on aarch64
kimishpatel Apr 4, 2025
05ae22c
Adds utility to replace Q/DQ ops with torchao quantized linear ops (#…
metascroy Apr 5, 2025
e6f52ff
Fix slice and padding for TensorCoreTiledLayout (#2015)
jerryzh168 Apr 6, 2025
ae5fa0e
Fix Int4WeightEmbeddingQATQuantizer.convert path (#2024)
andrewor14 Apr 7, 2025
061fae4
Add gguf q4_k quantization (#2001)
jerryzh168 Apr 8, 2025
4b8a0d8
torch/ao
gmagogsfm Apr 8, 2025
da111e4
Adds Q/DQ layout support for embedding quantization with IntxWeightOn…
metascroy Apr 8, 2025
e367b21
Merge remote-tracking branch 'origin/main' into base-fix
jainapurva Apr 8, 2025
154 changes: 89 additions & 65 deletions benchmarks/microbenchmarks/benchmark_inference.py
@@ -20,6 +20,8 @@
BenchmarkResult,
clean_caches,
create_model_and_input,
generate_memory_profile,
generate_model_profile,
model_inference_time_in_ms,
string_to_config,
)
@@ -29,70 +31,92 @@

def run(config: BenchmarkConfig) -> BenchmarkResult:
"""Run inference benchmarks"""
clean_caches() # Clean caches

# Create output directory if it doesn't exist
Path(config.output_dir).mkdir(parents=True, exist_ok=True)

base_model, input_data = create_model_and_input(
config.model_type,
config.m,
config.k,
config.n,
high_precision_dtype=config.high_precision_dtype,
device=config.device,
)

# Use quantize_ to apply each quantization function to the model
m_copy = deepcopy(base_model).eval().to(config.device)
ao_base_config = string_to_config(
config.quantization,
config.sparsity,
high_precision_dtype=config.high_precision_dtype,
)

# Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
is_cuda = config.device == "cuda" and torch.cuda.is_available()

if config.sparsity is not None and (
config.quantization is None or "baseline" in config.quantization
):
if is_cuda:
print(f"Applying {config.sparsity} sparsity to model")
sparsify_(m_copy, ao_base_config)
try:
clean_caches() # Clean caches

# Create output directory if it doesn't exist
Path(config.output_dir).mkdir(parents=True, exist_ok=True)

base_model, input_data = create_model_and_input(
config.model_type,
config.m,
config.k,
config.n,
high_precision_dtype=config.high_precision_dtype,
device=config.device,
)

# Use quantize_ to apply each quantization function to the model
m_copy = deepcopy(base_model).eval().to(config.device)
ao_base_config = string_to_config(
config.quantization,
config.sparsity,
high_precision_dtype=config.high_precision_dtype,
)

# Check if sparsity is requested and if the device is CUDA (sparsity operations require CUDA)
is_cuda = config.device == "cuda" and torch.cuda.is_available()

if config.sparsity is not None and (
config.quantization is None or "baseline" in config.quantization
):
if is_cuda:
print(f"Applying {config.sparsity} sparsity to model")
sparsify_(m_copy, ao_base_config)
else:
print(
f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
)
elif config.sparsity is None and (
config.quantization is None or "baseline" in config.quantization
):
pass # No quantization or sparsity specified, do nothing
else:
print(
f"Warning: Skipping {config.sparsity} sparsity as it requires CUDA, but device is {config.device}"
print("Quantizing model....")
quantize_(m_copy, ao_base_config)

if config.use_torch_compile:
print("Compiling model....")
m_copy = torch.compile(
m_copy, mode=config.torch_compile_mode, fullgraph=True
)
elif config.sparsity is None and (
config.quantization is None or "baseline" in config.quantization
):
pass # No quantization or sparsity specified, do nothing
else:
print("Quantizing model....")
quantize_(m_copy, ao_base_config)

if config.use_torch_compile:
print("Compiling model....")
m_copy = torch.compile(m_copy, mode=config.torch_compile_mode, fullgraph=True)

# Run benchmarks
result = BenchmarkResult(config=config)

# Benchmark time to run an inference call for quantized model
result.model_inference_time_in_ms = model_inference_time_in_ms(
model=m_copy, input_data=input_data
)

# TODO: Benchmark time using profiler
# Profile dtype model evaluation
# prof_dtype = benchmark_model_op_with_profiler_in_microseconds(m_copy, input_data, quantized_dtype)
# prof_dtype.export_chrome_trace(f"{quantization}_model_{input_data[0].size()[0]}.json") # Save profiling details

# TODO: Benchmark gemm time using cuda graph
# gemm_time = benchmark_torch_function_in_microseconds(gemm_op, *args, **kwargs)

# TODO: Benchmark op with cuda graph
# time = benchmark_op_with_cuda_graph(op, args)

return result

# Run benchmarks
result = BenchmarkResult(config=config)
# Store result in model for memory profiling
m_copy._benchmark_result = result

# Benchmark time to run an inference call for quantized model
result.model_inference_time_in_ms = model_inference_time_in_ms(
model=m_copy, input_data=input_data
)

# Run profiler if enabled
if config.enable_profiler:
print("Running profiler...")
try:
result.profiler_json_path, result.perfetto_url = generate_model_profile(
m_copy, input_data, config.profiler_file_name
)
except Exception as e:
print(f"Error running profiler: {e}")

# Run memory profiler if enabled
if config.enable_memory_profile:
print("Running memory profiler...")
try:
result.memory_profile_path, result.memory_stats = (
generate_memory_profile(
m_copy, input_data, config.memory_profile_file_name
)
)
except Exception as e:
print(f"Error running memory profiler: {e}")

return result
except Exception as e:
print(f"Error in benchmark run: {e}")
import traceback

print(traceback.format_exc())
return None
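The two helpers imported at the top of this diff, generate_model_profile and generate_memory_profile, are added by this PR alongside BenchmarkResult and clean_caches (presumably in the microbenchmarks utils module). The sketch below is only an illustration of what they plausibly do, inferred from the call sites above (a Chrome trace exported for Perfetto, plus a CUDA memory snapshot); the signatures, return values, and Perfetto URL handling are assumptions, not the PR's actual implementation.

```python
# Illustrative sketch only: assumed shapes of generate_model_profile and
# generate_memory_profile based on how they are called in benchmark_inference.py.
import torch
from torch.profiler import ProfilerActivity, profile


def generate_model_profile(model, input_data, profile_file_path):
    """Profile one inference pass and export a Chrome trace (assumed behavior)."""
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)
    with torch.no_grad(), profile(activities=activities, record_shapes=True) as prof:
        model(input_data)
    prof.export_chrome_trace(profile_file_path)
    # The call site expects a (path, URL) pair; the URL below is a placeholder,
    # not the PR's actual Perfetto link generation.
    return profile_file_path, "https://ui.perfetto.dev/"


def generate_memory_profile(model, input_data, memory_profile_path):
    """Capture a CUDA memory snapshot around one inference pass (assumed behavior)."""
    torch.cuda.memory._record_memory_history(max_entries=100000)
    with torch.no_grad():
        model(input_data)
    torch.cuda.memory._dump_snapshot(memory_profile_path)
    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
    # The call site expects (path, stats); torch.cuda.memory_stats() is one
    # plausible source for the stats dictionary.
    return memory_profile_path, torch.cuda.memory_stats()
```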
22 changes: 14 additions & 8 deletions benchmarks/microbenchmarks/benchmark_runner.py
@@ -164,16 +164,22 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None
f"Running: {config.name} for Quantization: {config.quantization} and Sparsity: {config.sparsity}"
)
result = run_inference(config) # Pass the config object directly
results.append(result)
except Exception:
print(f"Error running benchmark {config.name}")
continue
if result is not None: # Only add successful results
results.append(result)
except Exception as e:
import traceback

# Add results to csv
generate_results_csv(results, configs[0].output_dir)
print(f"Error running benchmark {config.name} with error: {e}")
print(traceback.format_exc())
continue

# Print results
print_results(results)
# Add results to csv if there are any
if results:
generate_results_csv(results, configs[0].output_dir)
# Print results
print_results(results)
else:
print("No benchmark results were collected. All benchmarks failed.")

# TODO: Process results: Speedups:
# 1. For different shapes for same model and quantization
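The TODO at the end of this hunk mentions computing speedups across shapes and recipes. A minimal sketch of that post-processing, assuming each BenchmarkResult keeps its BenchmarkConfig (with the name, m/k/n, and quantization fields used elsewhere in this diff) and its model_inference_time_in_ms; this is not code from the PR.

```python
# Hypothetical speedup aggregation over collected BenchmarkResult objects.
from collections import defaultdict


def compute_speedups(results):
    # Group runs by (model name, shape) so each quantization variant is compared
    # against the baseline run of the same workload.
    grouped = defaultdict(dict)
    for r in results:
        key = (r.config.name, (r.config.m, r.config.k, r.config.n))
        grouped[key][r.config.quantization or "baseline"] = r.model_inference_time_in_ms

    speedups = {}
    for key, times in grouped.items():
        baseline_ms = times.get("baseline")
        if not baseline_ms:
            continue
        for quant, ms in times.items():
            if quant != "baseline" and ms:
                speedups[(key, quant)] = baseline_ms / ms
    return speedups
```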
61 changes: 33 additions & 28 deletions benchmarks/microbenchmarks/test/benchmark_config.yml
@@ -2,46 +2,51 @@
benchmark_mode: "inference"
quantization_config_recipe_names:
# Will run a baseline inference for model by default, without quantization for comparison
- "int4wo-32"
- "marlin"
sparsity_config_recipe_names:
# - "int4wo-32"
# - "marlin"
- "int8wo"
# sparsity_config_recipe_names:
# Will run a baseline inference for model by default, without sparsity for comparison
- "semi-sparse"
- "block"
# - "semi-sparse"
# - "block"
output_dir: "benchmarks/microbenchmarks/results"
model_params:
- name: "small_bf16_linear"
matrix_shapes:
- name: "custom"
shapes: [
[1024, 1024, 1024], # [m, k, n]
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "linear"
# - name: "small_bf16_linear"
# matrix_shapes:
# - name: "custom"
# shapes: [
# [1024, 1024, 1024], # [m, k, n]
# ]
# high_precision_dtype: "torch.bfloat16"
# use_torch_compile: true
# torch_compile_mode: "max-autotune"
# device: "cuda"
# model_type: "linear"
# enable_profiler: true # Enable profiling for this model

- name: "large_bf16_ln_linear"
matrix_shapes:
- name: "custom"
shapes: [
[2048, 4096, 1024],
[4096, 4096, 1024]
# [4096, 4096, 1024]
]
high_precision_dtype: "torch.bfloat16"
use_torch_compile: true
torch_compile_mode: "max-autotune"
device: "cuda"
model_type: "ln_linear_sigmoid"

- name: "cpu_fp32_linear"
matrix_shapes:
- name: "custom"
shapes: [
[4096, 4096, 1024]
]
high_precision_dtype: "torch.float32"
use_torch_compile: false
device: "cpu"
model_type: "linear"
enable_profiler: true # Enable profiling for this model
enable_memory_profile: true # Enable memory profiling for this model

# - name: "cpu_fp32_linear"
# matrix_shapes:
# - name: "custom"
# shapes: [
# [4096, 4096, 1024]
# ]
# high_precision_dtype: "torch.float32"
# use_torch_compile: false
# device: "cpu"
# model_type: "linear"
# enable_profiler: true # Enable profiling for this model
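With enable_profiler set in the config above, each run exports a Chrome trace that can be opened in Perfetto or inspected directly. Below is a small standalone sketch (not part of this PR) that summarizes the heaviest ops in such a trace; it assumes only the standard Chrome trace JSON format that torch.profiler's export_chrome_trace emits.

```python
# Summarize total duration per op name from an exported Chrome trace.
import json
from collections import defaultdict


def summarize_trace(trace_path, top_n=10):
    with open(trace_path) as f:
        events = json.load(f).get("traceEvents", [])
    totals = defaultdict(float)
    for ev in events:
        # Complete ("X") events carry a duration in microseconds.
        if ev.get("ph") == "X" and "dur" in ev:
            totals[ev.get("name", "<unknown>")] += ev["dur"]
    for name, dur_us in sorted(totals.items(), key=lambda kv: kv[1], reverse=True)[:top_n]:
        print(f"{dur_us / 1000:.3f} ms  {name}")
```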