diff --git a/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json b/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json index 296380f7..ffc906c8 100644 --- a/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json +++ b/vllm-benchmarks/benchmarks/hpu/latency-tests-hpu.json @@ -51,5 +51,59 @@ "max-model-len": 256, "async-scheduling": "" } + }, + { + "test_name": "latency_deepseek_r1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 2048, + "dtype": "bfloat16" + } + }, + { + "test_name": "latency_llama4_maverick_17b128e_instruct_fp8", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 2048, + "async-scheduling": "" + } + }, + { + "test_name": "latency_qwen3_8b", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15, + "max-model-len": 2048, + "dtype": "bfloat16", + "async-scheduling": "" + } } ] diff --git a/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json b/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json index 8c6b34bd..d532ba52 100644 --- a/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json +++ b/vllm-benchmarks/benchmarks/hpu/serving-tests-hpu.json @@ -78,5 +78,82 @@ "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200 } + }, + { + "test_name": 
"serving_deepseek_r1", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "load_format": "dummy", + "max-model-len": 2048, + "max-num-seqs": 200, + "async-scheduling": "", + "dtype": "bfloat16" + }, + "client_parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_maverick_17b128e_instruct_fp8", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "disable_log_stats": "", + "max-model-len": 2048, + "max-num-seqs": 128 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_qwen3_8b", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "server_parameters": { + "model": "Qwen/Qwen-3-8B", + "tensor_parallel_size": 1, + "swap_space": 16, + "dtype": "bfloat16", + "load_format": "dummy", + "disable_log_stats": "" + }, + "client_parameters": { + "model": "Qwen/Qwen-3-8B", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } } ] diff --git 
a/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json b/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json index 3127bf2f..2c8b927b 100644 --- a/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json +++ b/vllm-benchmarks/benchmarks/hpu/throughput-tests-hpu.json @@ -57,5 +57,64 @@ "max-num-seqs": 512, "async-scheduling": "" } + }, + { + "test_name": "throughput_deepseek_r1", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "deepseek-ai/DeepSeek-R1", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 128, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_llama4_maverick_17b128e_instruct_fp8", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_name": "sharegpt", + "num_prompts": 200, + "backend": "vllm", + "max-model-len": 2048, + "max-num-seqs": 128, + "async-scheduling": "" + } + }, + { + "test_name": "throughput_qwen3_8b", + "environment_variables": { + "PT_HPU_LAZY_MODE": 1, + "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1, + "VLLM_CONTIGUOUS_PA": 1, + "VLLM_DEFRAG": 1 + }, + "parameters": { + "model": "Qwen/Qwen3-8B", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "dataset_name": "sharegpt", + "num_prompts": 1000, + "backend": "vllm", + "async-scheduling": "" + } + } ]