From 418d2f8bfb5593bce89641d79849900f7294b859 Mon Sep 17 00:00:00 2001 From: Ekagra Ranjan <3116519+ekagra-ranjan@users.noreply.github.com> Date: Wed, 14 May 2025 15:31:46 -0400 Subject: [PATCH 01/58] [V1][Spec Decode] Share input embedding of target model with EAGLE draft model to free ~1GB for llama 3 model (#17326) Co-authored-by: root Co-authored-by: Woosuk Kwon --- examples/offline_inference/eagle.py | 7 ++++++ vllm/model_executor/models/llama_eagle.py | 27 +++++++++++++------- vllm/model_executor/models/llama_eagle3.py | 15 +++++++---- vllm/v1/spec_decode/eagle.py | 29 ++++++++++++++++++---- 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py index 020521611f33..615f67e9f8d8 100644 --- a/examples/offline_inference/eagle.py +++ b/examples/offline_inference/eagle.py @@ -105,6 +105,13 @@ def main(): outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params) + # print the generated text + for output in outputs: + print("-" * 50) + print(f"prompt: {output.prompt}") + print(f"generated text: {output.outputs[0].text}") + print("-" * 50) + if not hasattr(outputs, "metrics") or outputs.metrics is None: return diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 76655bd71b15..4e51daa220e4 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -8,6 +8,7 @@ from vllm.compilation.decorators import support_torch_compile from vllm.config import VllmConfig +from vllm.distributed.parallel_state import get_pp_group from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -52,11 +53,15 @@ def __init__( self.config = vllm_config. \ speculative_config.draft_model_config.hf_config self.vocab_size = self.config.vocab_size - self.embed_tokens = VocabParallelEmbedding( - self.config.vocab_size, - self.config.hidden_size, - prefix=maybe_prefix(prefix, "embed_tokens"), - ) + + # if PP disabled then draft will share embed with target + if get_pp_group().world_size > 1: + self.embed_tokens = VocabParallelEmbedding( + self.config.vocab_size, + self.config.hidden_size, + prefix=maybe_prefix(prefix, "embed_tokens"), + ) + self.layers = nn.ModuleList([ LlamaDecoderLayer( self.config, @@ -109,6 +114,12 @@ def load_weights(self, weights: Iterable[Tuple[str, weight_loader(param, loaded_weight, shard_id) break else: + + # if PP disabled then draft will share embed with target + if get_pp_group().world_size == 1 and \ + "embed_tokens." in name: + continue + param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) @@ -142,8 +153,7 @@ def forward( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, - skip_prefixes=(["lm_head."] - if self.config.tie_word_embeddings else None), + skip_prefixes=None, ) model_weights = {} @@ -151,5 +161,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if "lm_head" not in name: name = "model." 
+ name
             model_weights[name] = loaded_weight
-
-        loader.load_weights(model_weights.items())
+        return loader.load_weights(model_weights.items())
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 904ff3210943..9761c8389db2 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -8,6 +8,7 @@
 
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import VllmConfig
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.logger import init_logger
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import QKVParallelLinear
@@ -91,11 +92,15 @@ def __init__(
         self.config = vllm_config. \
             speculative_config.draft_model_config.hf_config
         self.vocab_size = self.config.vocab_size
-        self.embed_tokens = VocabParallelEmbedding(
-            self.config.vocab_size,
-            self.config.hidden_size,
-            prefix=maybe_prefix(prefix, "embed_tokens"),
-        )
+
+        # if PP disabled then draft will share embed with target
+        if get_pp_group().world_size > 1:
+            self.embed_tokens = VocabParallelEmbedding(
+                self.config.vocab_size,
+                self.config.hidden_size,
+                prefix=maybe_prefix(prefix, "embed_tokens"),
+            )
+
         self.layers = nn.ModuleList([
             LlamaDecoderLayer(
                 self.config,
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 8af8fda3968c..5b84bc1f5ec3 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -5,6 +5,7 @@
 from vllm.attention.layer import Attention
 from vllm.config import (CompilationLevel, VllmConfig,
                          get_layers_from_vllm_config, set_current_vllm_config)
+from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model_loader
@@ -306,12 +307,30 @@ def load_model(self, target_model: nn.Module) -> None:
         self.attn_layer_name = next(iter(draft_attn_layer_names))
         loaded_weights = self.model.load_weights(
             loader.get_all_weights(draft_model_config, self.model))
-        if self.vllm_config.speculative_config.method == "eagle3":
-            if "model.embed_tokens.weight" not in loaded_weights:
-                logger.info(
-                    "Loading EAGLE embedding weights from the target model.")
-                self.model.model.embed_tokens = target_model.model.embed_tokens
+
+        # share embed_tokens with the target model if needed
+        if get_pp_group().world_size == 1:
+            assert "model.embed_tokens.weight" not in loaded_weights, \
+                "For PP = 1, Eagle draft should share embed with target model"
+            logger.info(
+                "The EAGLE head shares the same vocab embedding" \
+                " with the target model."
+            )
+            self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
+            assert "model.embed_tokens.weight" in loaded_weights, \
+                "For PP > 1, Eagle draft checkpoint should have its own copy" \
+                " of the model.embed_tokens.weight"
+            logger.info(
+                "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
+                " weights instead of sharing them with the target model."
+ ) + + # share lm_head with the target model if needed + # some model definition do not define lm_head explicitly + # and reuse embed_tokens for lm_head, e.g., CohereForCausalLM + if self.vllm_config.speculative_config.method != "eagle3" and \ + hasattr(target_model, "lm_head"): logger.info("Loading EAGLE LM head weights from the target model.") self.model.lm_head = target_model.lm_head From f9c069c85e029830094ff9abb926ffbf37b7c7e7 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Wed, 14 May 2025 16:11:54 -0400 Subject: [PATCH 02/58] Modularize fused experts and integrate PPLX kernels (#15956) --- csrc/activation_kernels.cu | 3 + csrc/dispatch_utils.h | 14 + csrc/moe/moe_align_sum_kernels.cu | 8 +- csrc/moe/topk_softmax_kernels.cu | 63 +- examples/offline_inference/data_parallel.py | 22 +- tests/kernels/moe/test_batched_moe.py | 114 +++ tests/kernels/moe/test_cutlass_moe.py | 46 +- tests/kernels/moe/test_moe.py | 93 ++- tests/kernels/moe/test_pplx_moe.py | 691 ++++++++++++++++ tests/kernels/moe/test_triton_moe_ptpc_fp8.py | 34 +- tests/kernels/quantization/test_block_fp8.py | 20 +- tests/kernels/quantization/test_block_int8.py | 5 +- vllm/distributed/parallel_state.py | 53 +- vllm/distributed/utils.py | 13 +- vllm/forward_context.py | 5 +- .../layers/fused_moe/__init__.py | 5 +- .../layers/fused_moe/cutlass_moe.py | 303 ++++--- .../layers/fused_moe/deep_gemm_moe.py | 329 +++----- .../layers/fused_moe/fused_batched_moe.py | 755 ++++++++++++++++++ .../layers/fused_moe/fused_moe.py | 388 ++++++--- vllm/model_executor/layers/fused_moe/layer.py | 551 +++++++++++-- .../layers/fused_moe/modular_kernel.py | 364 +++++++++ .../layers/fused_moe/moe_permute_unpermute.py | 90 ++- .../layers/fused_moe/pplx_prepare_finalize.py | 147 ++++ .../layers/fused_moe/prepare_finalize.py | 60 ++ .../layers/fused_moe/triton_deep_gemm_moe.py | 112 +++ vllm/model_executor/layers/fused_moe/utils.py | 59 +- .../model_executor/layers/quantization/fp8.py | 84 +- vllm/model_executor/models/dbrx.py | 1 - vllm/model_executor/models/deepseek_v2.py | 14 +- vllm/model_executor/models/llama4.py | 8 +- vllm/model_executor/models/qwen2_moe.py | 9 +- vllm/model_executor/models/qwen3_moe.py | 6 +- vllm/platforms/cuda.py | 1 + vllm/v1/attention/backends/mla/common.py | 6 +- vllm/v1/worker/gpu_worker.py | 3 +- vllm/v1/worker/tpu_worker.py | 3 +- vllm/worker/cpu_worker.py | 3 +- vllm/worker/hpu_worker.py | 6 +- vllm/worker/tpu_worker.py | 3 +- vllm/worker/worker.py | 3 +- vllm/worker/xpu_worker.py | 3 +- 42 files changed, 3835 insertions(+), 665 deletions(-) create mode 100644 tests/kernels/moe/test_batched_moe.py create mode 100644 tests/kernels/moe/test_pplx_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/fused_batched_moe.py create mode 100644 vllm/model_executor/layers/fused_moe/modular_kernel.py create mode 100644 vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py create mode 100644 vllm/model_executor/layers/fused_moe/prepare_finalize.py create mode 100644 vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 88275dbdd83a..55e659679701 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -70,6 +70,9 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { int64_t num_tokens = input.numel() / input.size(-1); \ dim3 grid(num_tokens); \ dim3 block(std::min(d, 1024)); \ + if (num_tokens == 0) { \ + return; \ + } \ const at::cuda::OptionalCUDAGuard 
device_guard(device_of(input)); \ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ VLLM_DISPATCH_FLOATING_TYPES( \ diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h index dc6e0769b878..f7b75c48373f 100644 --- a/csrc/dispatch_utils.h +++ b/csrc/dispatch_utils.h @@ -65,5 +65,19 @@ AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) +#define VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(...) \ + AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt32, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::UInt64, __VA_ARGS__) + #define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__)) + +#define VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_SWITCH( \ + TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_AND_UNSIGNED_TYPES(__VA_ARGS__)) diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index d7be769458e3..6b6a9d04a60f 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -326,7 +326,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, } if (use_global_memory) { - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] { // calc needed amount of shared mem for `tokens_cnts` and `cumsum` // tensors @@ -351,7 +351,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, cumsum_buffer.data_ptr()); }); } else if (use_i16) { - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { // set dynamic shared mem auto kernel = @@ -366,7 +366,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, topk_ids.numel()); }); } else { - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] { auto kernel = vllm::moe::moe_align_block_size_kernel; @@ -391,7 +391,7 @@ void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, TORCH_CHECK(num_experts == 256, "sgl_moe_align_block_size kernel only supports deepseek v3."); - VLLM_DISPATCH_INTEGRAL_TYPES( + VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES( topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] { // calc needed amount of shared mem for `cumsum` tensors auto options_int = diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu index de9747b60252..a9379032245d 100644 --- a/csrc/moe/topk_softmax_kernels.cu +++ b/csrc/moe/topk_softmax_kernels.cu @@ -108,9 +108,17 @@ __launch_bounds__(TPB) __global__ } } -template -__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output, - int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert) +template +__launch_bounds__(TPB) __global__ void moeTopK( + const float* inputs_after_softmax, + const bool* finished, + float* output, + IndType* indices, + int* source_rows, + const int num_experts, + const int k, + const 
int start_expert, + const int end_expert) { using cub_kvp = cub::KeyValuePair; @@ -182,9 +190,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax 2) This implementation assumes k is small, but will work for any k. */ -template +template __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ - void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices, + void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, IndType* indices, int* source_rows, const int k, const int start_expert, const int end_expert) { // We begin by enforcing compile time assertions and setting up compile time constants. @@ -397,8 +405,8 @@ struct TopkConstants }; } // namespace detail -template -void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices, +template +void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices, int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream) { static constexpr std::size_t MAX_BYTES_PER_LDG = 16; @@ -421,10 +429,11 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f token_expert_indices, num_tokens, topk, 0, num_experts, \ stream); +template void topkGatingSoftmaxKernelLauncher( const float* gating_output, float* topk_weights, - int* topk_indicies, + IndType* topk_indicies, int* token_expert_indices, float* softmax_workspace, const int num_tokens, @@ -493,14 +502,32 @@ void topk_softmax( const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options()); - vllm::moe::topkGatingSoftmaxKernelLauncher( - gating_output.data_ptr(), - topk_weights.data_ptr(), - topk_indices.data_ptr(), - token_expert_indices.data_ptr(), - softmax_workspace.data_ptr(), - num_tokens, - num_experts, - topk, - stream); + + if(topk_indices.scalar_type() == at::ScalarType::Int) + { + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } + else + { + assert(topk_indices.scalar_type() == at::ScalarType::UInt32); + vllm::moe::topkGatingSoftmaxKernelLauncher( + gating_output.data_ptr(), + topk_weights.data_ptr(), + topk_indices.data_ptr(), + token_expert_indices.data_ptr(), + softmax_workspace.data_ptr(), + num_tokens, + num_experts, + topk, + stream); + } } diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py index 965915beaf58..f636a08c0b09 100644 --- a/examples/offline_inference/data_parallel.py +++ b/examples/offline_inference/data_parallel.py @@ -65,11 +65,17 @@ def parse_args(): type=int, default=0, help="Master node port") + parser.add_argument("--enforce-eager", + action='store_true', + help="Enforce eager mode execution.") + parser.add_argument("--trust-remote-code", + action='store_true', + help="Trust remote code.") return parser.parse_args() def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, - dp_master_port, GPUs_per_dp_rank): + dp_master_port, GPUs_per_dp_rank, enforce_eager, trust_remote_code): os.environ["VLLM_DP_RANK"] = str(global_dp_rank) 
os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank) os.environ["VLLM_DP_SIZE"] = str(dp_size) @@ -109,10 +115,13 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, max_tokens=[16, 20][global_dp_rank % 2]) # Create an LLM. - llm = LLM(model=model, - tensor_parallel_size=GPUs_per_dp_rank, - enforce_eager=True, - enable_expert_parallel=True) + llm = LLM( + model=model, + tensor_parallel_size=GPUs_per_dp_rank, + enforce_eager=enforce_eager, + enable_expert_parallel=True, + trust_remote_code=trust_remote_code, + ) outputs = llm.generate(prompts, sampling_params) # Print the outputs. for i, output in enumerate(outputs): @@ -155,7 +164,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, proc = Process(target=main, args=(args.model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, - tp_size)) + tp_size, args.enforce_eager, + args.trust_remote_code)) proc.start() procs.append(proc) exit_code = 0 diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py new file mode 100644 index 000000000000..7d369edfc86a --- /dev/null +++ b/tests/kernels/moe/test_batched_moe.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass + +import pytest +import torch +import triton.language as tl + +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + invoke_moe_batched_triton_kernel) + + +@dataclass +class BatchedMMConfig: + dtype: torch.dtype + num_experts: int + max_tokens_per_expert: int + K: int + N: int + + +@dataclass +class BatchedMMTensors: + A: torch.Tensor # [E, max_tokens, K] + B: torch.Tensor # [E, K, N] - column major + C: torch.Tensor # [E, max_tokens, N] + num_expert_tokens: torch.Tensor # [E] + + @staticmethod + def make_tensors(config: BatchedMMConfig): + A = torch.randn( + (config.num_experts, config.max_tokens_per_expert, config.K), + device="cuda", + dtype=config.dtype) / 10 + B = torch.randn((config.num_experts, config.N, config.K), + device="cuda", + dtype=config.dtype) + C = torch.zeros( + (config.num_experts, config.max_tokens_per_expert, config.N), + device="cuda", + dtype=config.dtype) + num_expert_tokens = torch.randint(low=0, + high=config.max_tokens_per_expert, + size=(config.num_experts, ), + device="cuda", + dtype=torch.int32) + return BatchedMMTensors(A, B, C, num_expert_tokens) + + +def ref_impl(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, + num_expert_tokens: torch.Tensor) -> torch.Tensor: + + num_expert_tokens_cpu = num_expert_tokens.clone() + num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu") + num_experts = num_expert_tokens.size(0) + + for e in range(num_experts): + num_tokens = num_expert_tokens_cpu[e] + C[e, :num_tokens, :] = A[e, :num_tokens, :] @ B[e].transpose(0, 1) + + return C + + +@pytest.mark.parametrize("num_experts", [16, 32]) +@pytest.mark.parametrize("max_tokens_per_expert", + [32, 64, 128, 192, 224, 256, 512]) +@pytest.mark.parametrize("K", [128, 256, 1024]) +@pytest.mark.parametrize("N", [128, 256, 512, 1024]) +@pytest.mark.parametrize("dtype", + [torch.float32, torch.float16, torch.bfloat16]) +def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int, + N: int, dtype: torch.dtype): + + config = BatchedMMConfig(dtype, num_experts, max_tokens_per_expert, K, N) + tensors = BatchedMMTensors.make_tensors(config) + + test_output = tensors.C + ref_output = test_output.clone() + + compute_tl_dtype = { + torch.float16: tl.float16, + torch.bfloat16: tl.bfloat16, + torch.float32: 
tl.float32 + }[test_output.dtype] + invoke_moe_batched_triton_kernel( + tensors.A, + tensors.B, + test_output, + tensors.num_expert_tokens, + compute_tl_dtype, + # Quantization data + None, + None, + None, + # Quantization schemes + False, + False, + False, + config={ + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 16 + }) + + ref_output = ref_impl(tensors.A, tensors.B, ref_output, + tensors.num_expert_tokens) + + rtol, atol = { + torch.float16: (6e-2, 6e-2), + torch.bfloat16: (6e-2, 6e-2), + torch.float32: (1e-2, 1e-2), + }[test_output.dtype] + + torch.testing.assert_close(test_output, ref_output, atol=atol, rtol=rtol) diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 975cd418a171..7db4fe0f46e3 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -30,6 +30,11 @@ (224, 3072, 1536), ] +vllm_config = VllmConfig(parallel_config=ParallelConfig( + pipeline_parallel_size=1)) +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + @dataclasses.dataclass class MOETensors: @@ -190,7 +195,7 @@ def run_8_bit(moe_tensors: MOETensors8Bit, 'w1_q': moe_tensors.w1_q.transpose(1, 2), # type: ignore[union-attr] 'w2_q': moe_tensors.w2_q.transpose(1, 2), # type: ignore[union-attr] 'topk_weights': topk_weights, - 'topk_ids_': topk_ids, + 'topk_ids': topk_ids, 'ab_strides1': moe_tensors.ab_strides1, 'c_strides1': moe_tensors.c_strides1, 'ab_strides2': moe_tensors.ab_strides2, @@ -231,18 +236,15 @@ def test_cutlass_moe_8_bit_no_graph( per_out_ch: bool, ): current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) score = torch.randn((m, e), device="cuda", dtype=torch.half) - topk_weights, topk_ids = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, + score, + topk, + renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. @@ -276,20 +278,17 @@ def test_cutlass_moe_8_bit_cuda_graph( per_out_ch: bool, ): current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + with set_current_vllm_config(vllm_config): dtype = torch.half mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) score = torch.randn((m, e), device="cuda", dtype=dtype) - topk_weights, topk_ids = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, + score, + topk, + renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. 
@@ -334,18 +333,15 @@ def test_cutlass_moe_8_bit_EP( ep_size: int, ): current_platform.seed_everything(7) - with set_current_vllm_config( - VllmConfig(parallel_config=ParallelConfig( - pipeline_parallel_size=1))): - + with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_channel) score = torch.randn((m, e), device="cuda", dtype=torch.half) - topk_weights, topk_ids = fused_topk(mt.a, - score, - topk, - renormalize=False) + topk_weights, topk_ids, _ = fused_topk(mt.a, + score, + topk, + renormalize=False) # Note that we are using the dequantized versions of the tensors. # Using a, w1 and w2 directly results in minor output differences. diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index 96b090136e3c..43ddc79fcb81 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -12,6 +12,7 @@ import vllm.model_executor.layers.fused_moe # noqa from tests.kernels.utils import opcheck, stack_and_dev, torch_moe +from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.moe_torch_iterative import ( @@ -32,6 +33,10 @@ EP_SIZE = [1, 4] TOP_KS = [2, 6] +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + @pytest.mark.parametrize("m", [1, 33, 64, 222, 1024 * 128]) @pytest.mark.parametrize("n", [128, 1024, 2048]) @@ -70,31 +75,33 @@ def test_fused_moe( else: e_map = None - torch_output = torch_moe(a, w1, w2, score, topk, e_map) - iterative_output = iterative_moe(a, - w1, - w2, - score, - topk, - global_num_experts=e, - expert_map=e_map, - renormalize=False) + with set_current_vllm_config(vllm_config): + torch_output = torch_moe(a, w1, w2, score, topk, e_map) + iterative_output = iterative_moe(a, + w1, + w2, + score, + topk, + global_num_experts=e, + expert_map=e_map, + renormalize=False) + + # Pad the weight if moe padding is enabled + if padding: + w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128] + torch.cuda.empty_cache() + w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128] + torch.cuda.empty_cache() + + triton_output = fused_moe(a, + w1, + w2, + score, + topk, + global_num_experts=e, + expert_map=e_map, + renormalize=False) - # Pad the weight if moe padding is enabled - if padding: - w1 = F.pad(w1, (0, 128), "constant", 0)[..., 0:-128] - torch.cuda.empty_cache() - w2 = F.pad(w2, (0, 128), "constant", 0)[..., 0:-128] - torch.cuda.empty_cache() - - triton_output = fused_moe(a, - w1, - w2, - score, - topk, - global_num_experts=e, - expert_map=e_map, - renormalize=False) torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) torch.testing.assert_close(iterative_output, torch_output, @@ -115,7 +122,6 @@ def test_fused_moe( def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, ep_size: int, dtype: torch.dtype, group_size: int, has_zp: bool, weight_bits: int): - print(m, n, k, e, topk, dtype, group_size, has_zp, weight_bits) a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 @@ -194,22 +200,24 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int, else: e_map = None - triton_output = fused_moe(a, - w1_qweight, - w2_qweight, - score, - topk, - renormalize=False, - 
use_int4_w4a16=weight_bits == 4, - use_int8_w8a16=weight_bits == 8, - global_num_experts=e, - expert_map=e_map, - w1_scale=w1_scales, - w2_scale=w2_scales, - w1_zp=w1_qzeros if has_zp else None, - w2_zp=w2_qzeros if has_zp else None, - block_shape=[0, group_size]) - torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, e_map) + with set_current_vllm_config(vllm_config): + triton_output = fused_moe(a, + w1_qweight, + w2_qweight, + score, + topk, + renormalize=False, + use_int4_w4a16=weight_bits == 4, + use_int8_w8a16=weight_bits == 8, + global_num_experts=e, + expert_map=e_map, + w1_scale=w1_scales, + w2_scale=w2_scales, + w1_zp=w1_qzeros if has_zp else None, + w2_zp=w2_qzeros if has_zp else None, + block_shape=[0, group_size]) + torch_output = torch_moe(a, w1_ref, w2_ref, score, topk, e_map) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) @@ -515,7 +523,8 @@ def test_fused_marlin_moe( topk_weights, topk_ids, _ = fused_topk(a, score, topk, False) - torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) + with set_current_vllm_config(vllm_config): + torch_output = torch_moe(a, w_ref1, w_ref2, score, topk, e_map) marlin_output = torch.ops.vllm.fused_marlin_moe( a, diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py new file mode 100644 index 000000000000..8c4a2c3fa440 --- /dev/null +++ b/tests/kernels/moe/test_pplx_moe.py @@ -0,0 +1,691 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Tests for the MOE layers. + +Run `pytest tests/kernels/test_pplx_moe.py`. +""" +import dataclasses +import os +import traceback +from typing import Callable, Optional + +import pytest +import torch + +try: + from pplx_kernels import AllToAll + from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, + nvshmem_finalize, nvshmem_get_unique_id, + nvshmem_init) + has_pplx = True +except ImportError: + has_pplx = False + +from torch.multiprocessing import ( + spawn) # pyright: ignore[reportPrivateImportUsage] +from typing_extensions import Concatenate, ParamSpec + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.fused_moe import override_config +from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( + BatchedExperts, BatchedPrepareAndFinalize, BatchedTritonExperts) +from vllm.model_executor.layers.fused_moe.fused_moe import (fused_topk, + get_default_config) +from vllm.model_executor.layers.fused_moe.modular_kernel import ( + FusedMoEModularKernel) +from vllm.platforms import current_platform + +PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512), + (222, 2048, 1024)] + +PPLX_MOE_COMBOS = [ + (1, 128, 128), + (2, 128, 512), + (3, 1024, 2048), + (32, 128, 1024), + (45, 512, 2048), + (64, 1024, 1024), + (222, 1024, 2048), +] + +NUM_EXPERTS = [8, 64] +EP_SIZE = [1, 4] +TOP_KS = [1, 2, 6] + +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + +P = ParamSpec("P") + +requires_pplx = pytest.mark.skipif( + not has_pplx, + reason="Requires PPLX kernels", +) + + +@dataclasses.dataclass +class ProcessGroupInfo: + world_size: int + world_local_size: int + rank: int + node_rank: int + local_rank: int + device: torch.device + + +def _worker_parallel_launch( + local_rank: int, + world_size: int, + world_local_size: int, + node_rank: int, + init_method: str, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + 
**kwargs: P.kwargs, +) -> None: + rank = node_rank * world_local_size + local_rank + torch.cuda.set_device(local_rank) + device = torch.device("cuda", local_rank) + torch.distributed.init_process_group( + backend="cpu:gloo,cuda:nccl", + init_method=init_method, + rank=rank, + world_size=world_size, + device_id=device, + ) + barrier = torch.tensor([rank], device=device) + torch.distributed.all_reduce(barrier) + + try: + worker( + ProcessGroupInfo( + world_size=world_size, + world_local_size=world_local_size, + rank=rank, + node_rank=node_rank, + local_rank=local_rank, + device=device, + ), + *args, + **kwargs, + ) + except Exception as ex: + print(ex) + traceback.print_exc() + raise + finally: + torch.distributed.destroy_process_group() + + +def parallel_launch( + world_size: int, + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + assert not kwargs + spawn( + _worker_parallel_launch, + args=( + world_size, + world_size, + 0, + "tcp://localhost:29500", + worker, + ) + args, + nprocs=world_size, + join=True, + ) + + +def parallel_launch_from_env( + worker: Callable[Concatenate[ProcessGroupInfo, P], None], + *args: P.args, + **kwargs: P.kwargs, +) -> None: + """ + Launches a worker function in parallel across all processes in the current + environment. The environment must have the following variables set: + - WORLD_SIZE: The total number of processes. + - WORLD_LOCAL_SIZE: The number of processes on the current node. + - NODE_RANK: The rank of the current + - MASTER_ADDR: The address of the master process. + - MASTER_PORT: The port of the master process. + """ + assert not kwargs + world_size = int(os.environ["WORLD_SIZE"]) + world_local_size = int(os.environ["WORLD_LOCAL_SIZE"]) + node_rank = int(os.environ["NODE_RANK"]) + assert "MASTER_ADDR" in os.environ + assert "MASTER_PORT" in os.environ + spawn( + _worker_parallel_launch, + args=( + world_size, + world_local_size, + node_rank, + "env://", + worker, + ) + args, + nprocs=world_local_size, + join=True, + ) + + +def torch_prepare( + a: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + max_num_tokens: Optional[int] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + assert topk_ids.dim() == 2 + assert topk_ids.shape[0] == a.shape[0] + + num_tokens, hidden_dim = a.shape + topk = topk_ids.shape[1] + + tokens_per_expert = torch.bincount(topk_ids.view(-1), + minlength=num_experts) + + assert tokens_per_expert.numel() == num_experts + + if max_num_tokens is None: + max_num_tokens = int(tokens_per_expert.max().item()) + + b_a = torch.zeros((num_experts, max_num_tokens, hidden_dim), + dtype=a.dtype, + device=a.device) + + token_counts = torch.zeros(num_experts, dtype=torch.int, device=a.device) + + for token in range(num_tokens): + for j in range(topk): + expert_id = topk_ids[token, j] + idx = token_counts[expert_id] + b_a[expert_id, idx:idx + 1, :] = a[token, :] + token_counts[expert_id] = token_counts[expert_id] + 1 + + return b_a, tokens_per_expert + + +def torch_finalize(b_out: torch.Tensor, topk_weight: torch.Tensor, + topk_ids: torch.Tensor) -> torch.Tensor: + num_tokens = topk_ids.shape[0] + num_experts = b_out.shape[0] + K = b_out.shape[-1] + out = torch.zeros((num_tokens, K), dtype=b_out.dtype, device=b_out.device) + expert_counts = torch.zeros(num_experts, + dtype=torch.int, + device=b_out.device) + for token in range(num_tokens): + expert_ids = topk_ids[token] + for i in range(expert_ids.numel()): + expert_id = expert_ids[i] + idx = expert_counts[expert_id] + 
out[token, :] = out[token, :] + b_out[expert_id, idx:idx + + 1, :] * topk_weight[token, i] + expert_counts[expert_id] = expert_counts[expert_id] + 1 + + return out + + +def torch_batched_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + num_experts = w1.shape[0] + b_a, tokens_per_expert = torch_prepare(a, topk_ids, num_experts) + assert b_a.dim() == 3 + num_tokens, topk = topk_ids.shape + _, max_num_tokens, K = b_a.shape + assert num_experts == b_a.shape[0] and w2.shape[1] == K + out = torch.zeros((num_experts, max_num_tokens, K), + dtype=b_a.dtype, + device=b_a.device) + tmp = torch.empty((max_num_tokens, w1.shape[1] // 2), + dtype=b_a.dtype, + device=b_a.device) + for expert in range(num_experts): + num = tokens_per_expert[expert] + if num > 0: + torch.ops._C.silu_and_mul( + tmp[:num], b_a[expert, :num, :] @ w1[expert].transpose(0, 1)) + out[expert, :num, :] = tmp[:num] @ w2[expert].transpose(0, 1) + + return torch_finalize(out, topk_weight, topk_ids) + + +def batched_moe( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + num_experts = w1.shape[0] + + fused_experts = FusedMoEModularKernel( + BatchedPrepareAndFinalize(a.shape[0], world_size=1, dp_size=1, rank=0), + BatchedExperts(max_num_tokens=a.shape[0], dp_size=1, world_size=1)) + + return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts) + + +# Note: same as torch_moe but with fused_topk factored out. +def torch_moe2( + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, +) -> torch.Tensor: + M, K = a.shape + topk = topk_ids.shape[1] + a = a.view(M, -1, K).repeat(1, topk, 1).reshape(-1, K) + out = torch.zeros(M * topk, w2.shape[1], dtype=a.dtype, device=a.device) + num_experts = w1.shape[0] + for i in range(num_experts): + mask = (topk_ids == i).view(-1) + if mask.sum(): + out[mask] = SiluAndMul()( + a[mask] @ w1[i].transpose(0, 1)) @ w2[i].transpose(0, 1) + + return (out.view(M, -1, w2.shape[1]) * + topk_weight.view(M, -1, 1).to(out.dtype)).sum(dim=1) + + +@pytest.mark.parametrize("m", [1, 33, 64, 222]) +@pytest.mark.parametrize("n", [128, 1024, 2048]) +@pytest.mark.parametrize("k", [128, 512, 1024]) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +def test_fused_moe_batched_experts( + m: int, + n: int, + k: int, + e: int, + topk: int, + dtype: torch.dtype, +): + current_platform.seed_everything(7) + + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + score = torch.randn((m, e), device="cuda", dtype=dtype) + + with set_current_vllm_config(vllm_config): + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids) + torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids) + batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids) + + torch.testing.assert_close(baseline_output, + torch_output, + atol=2e-2, + rtol=0) + torch.testing.assert_close(baseline_output, + batched_output, + atol=2e-2, + rtol=0) + + +def rank_chunk(num: int, r: int, w: int) -> int: + rem = num % w + return (num // w) + (1 if r < rem else 0) + + +def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor: + 
chunk = rank_chunk(t.shape[0], r, w) + return t[(r * chunk):(r + 1) * chunk] + + +def pplx_prepare_finalize(pgi: ProcessGroupInfo, dp_size: int, a: torch.Tensor, + topk_weight: torch.Tensor, topk_ids: torch.Tensor, + num_experts: int) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( + PplxPrepareAndFinalize) + + assert torch.cuda.current_device() == pgi.local_rank + + topk = topk_ids.shape[1] + num_tokens, hidden_dim = a.shape + block_size = 128 + device = pgi.device + rank = pgi.rank + world_size = pgi.world_size + max_num_tokens = rank_chunk(num_tokens, 0, world_size) + + ata = AllToAll.internode( + max_num_tokens=max_num_tokens, + num_experts=num_experts, + experts_per_token=topk, + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=hidden_dim, + hidden_dim_bytes=hidden_dim * a.dtype.itemsize, + hidden_dim_scale_bytes=(0 if a.dtype.itemsize != 1 else + ((hidden_dim + block_size - 1) // block_size * + torch.float32.itemsize)), + ) + + topk_ids = topk_ids.to(dtype=torch.uint32) + + prepare_finalize = PplxPrepareAndFinalize( + ata, + max_num_tokens, + world_size, + rank, + dp_size, + a.dtype, + ) + + a_chunk = chunk_by_rank(a, rank, world_size).to(device) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + + b_a, b_a_scale, expert_num_tokens = prepare_finalize.prepare( + a_chunk, + None, + None, + chunk_topk_weight, + chunk_topk_ids, + num_experts, + None, + False, + ) + + b_a = b_a * 1.5 + + out = torch.full( + (max_num_tokens, hidden_dim), + torch.nan, + dtype=a.dtype, + device=device, + ) + + prepare_finalize.finalize( + out, + b_a, + chunk_topk_weight, + chunk_topk_ids, + False, + ) + + torch.cuda.synchronize() + + ata.destroy() + + num_tokens = a_chunk.shape[0] + + return out[:num_tokens] + + +def _pplx_prepare_finalize( + pgi: ProcessGroupInfo, + dp_size: int, + a: torch.Tensor, + score: torch.Tensor, + topk: torch.Tensor, + num_experts: int, +): + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + device = pgi.device + + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + k = a.shape[1] + + a_rep = torch.repeat_interleave(a, topk, dim=0).to(device) + + torch_output = (a_rep.view(-1, topk, k) * 1.5 * + topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to( + a.dtype) + + pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids, + num_experts) + + torch_output = chunk_by_rank(torch_output, pgi.rank, + pgi.world_size).to(pplx_output.device) + + torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) + + nvshmem_finalize() + + +# TODO (bnell): this test point does not work for odd M due to how the test is +# written, not due to limitations of the pplx kernels. The pplx_moe +# test below is able to deal with odd M. 
+@pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) +@pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@requires_pplx +def test_pplx_prepare_finalize( + mnk: tuple[int, int, int], + e: int, + topk: int, + dtype: torch.dtype, + world_dp_size: tuple[int, int], +): + current_platform.seed_everything(7) + m, n, k = mnk + world_size, dp_size = world_dp_size + device = "cuda" + a = torch.randn((m, k), device=device, dtype=dtype) / 10 + score = torch.randn((m, e), device=device, dtype=dtype) + + parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, score, + topk, e) + + +def pplx_moe( + rank: int, + world_size: int, + dp_size: int, + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weight: torch.Tensor, + topk_ids: torch.Tensor, + use_compile: bool = True, + use_cudagraphs: bool = True, +) -> torch.Tensor: + from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import ( + PplxPrepareAndFinalize) + + device = torch.device("cuda", rank) + hidden_dim = a.shape[1] + num_experts = w1.shape[0] + block_size = 128 + topk = topk_ids.shape[1] + max_num_tokens = rank_chunk(a.shape[0], 0, world_size) + + ata = AllToAll.internode( + max_num_tokens=max_num_tokens, + num_experts=num_experts, + experts_per_token=topk, + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=hidden_dim, + hidden_dim_bytes=hidden_dim * a.dtype.itemsize, + hidden_dim_scale_bytes=(0 if a.dtype.itemsize != 1 else + ((hidden_dim + block_size - 1) // block_size * + torch.float32.itemsize)), + ) + + topk_ids = topk_ids.to(dtype=torch.uint32) + + prepare_finalize = PplxPrepareAndFinalize( + ata, + max_num_tokens, + world_size, + rank, + dp_size, + ) + + experts = BatchedTritonExperts(max_num_tokens=a.shape[0], + world_size=world_size, + dp_size=dp_size) + + fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + # Note: workers with the same dp_rank must use the exact same inputs. 
+ a_chunk = chunk_by_rank(a, rank, world_size).to(device) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + + # Chunking weights like this only works for batched format + w1_chunk = chunk_by_rank(w1, rank, world_size).to(device) + w2_chunk = chunk_by_rank(w2, rank, world_size).to(device) + + if use_compile: + _fused_experts = torch.compile(fused_experts, + backend='inductor', + fullgraph=True) + else: + _fused_experts = fused_experts + + out = _fused_experts(a_chunk, + w1_chunk, + w2_chunk, + chunk_topk_weight, + chunk_topk_ids, + global_num_experts=num_experts) + + if use_cudagraphs: + out.fill_(0) + stream = torch.cuda.Stream() + graph = torch.cuda.CUDAGraph() + with torch.cuda.graph(graph, stream=stream): + out = _fused_experts(a_chunk, + w1_chunk, + w2_chunk, + chunk_topk_weight, + chunk_topk_ids, + global_num_experts=num_experts) + + torch.cuda.synchronize() + graph.replay() + + torch.cuda.synchronize() + + ata.destroy() + + return out + + +def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids): + assert torch.cuda.current_device() == pgi.local_rank + + num_experts = w1.shape[0] + device = pgi.device + rank = pgi.rank + world_size = pgi.world_size + max_num_tokens = rank_chunk(a.shape[0], 0, world_size) + + prepare_finalize = BatchedPrepareAndFinalize( + max_num_tokens=max_num_tokens, + world_size=world_size, + dp_size=dp_size, + rank=rank, + ) + + experts = BatchedExperts(max_num_tokens=a.shape[0], + world_size=1, + dp_size=1) + + fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + # Note: workers with the same dp_rank must use the exact same inputs. + a_chunk = chunk_by_rank(a, rank, world_size).to(device) + chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device) + chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device) + + out = fused_experts( + a_chunk, + # Chunking weights like this only works for batched format + chunk_by_rank(w1, rank, world_size).to(device), + chunk_by_rank(w2, rank, world_size).to(device), + chunk_topk_weight, + chunk_topk_ids, + global_num_experts=num_experts) + + return out + + +def _pplx_moe( + pgi: ProcessGroupInfo, + dp_size: int, + a: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + score: torch.Tensor, + topk: int, +): + uid = nvshmem_get_unique_id( + ) if pgi.rank == 0 else nvshmem_alloc_empty_unique_id() + torch.distributed.broadcast(uid, src=0) + nvshmem_init(uid, pgi.rank, pgi.world_size) + + m, k = a.shape + e, _, n = w2.shape + + moe_config = get_default_config(m, e, n, k, topk, a.dtype, False) + + with set_current_vllm_config(vllm_config), override_config(moe_config): + topk_weight, topk_ids, _ = fused_topk(a, score, topk, False) + torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids) + pplx_output = pplx_moe(pgi.rank, pgi.world_size, dp_size, a, w1, w2, + topk_weight, topk_ids) + # TODO (bnell): fix + re-enable + #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, + # topk_ids) + + torch_output = chunk_by_rank(torch_output, pgi.rank, + pgi.world_size).to(pplx_output.device) + + torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0) + #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0) + + nvshmem_finalize() + + +@pytest.mark.parametrize("mnk", PPLX_MOE_COMBOS) +@pytest.mark.parametrize("e", NUM_EXPERTS) +@pytest.mark.parametrize("topk", TOP_KS) +@pytest.mark.parametrize("dtype", [torch.bfloat16]) 
+@pytest.mark.parametrize("world_dp_size", [[2, 1]]) +@requires_pplx +def test_pplx_moe( + mnk: tuple[int, int, int], + e: int, + topk: int, + dtype: torch.dtype, + world_dp_size: tuple[int, int], +): + current_platform.seed_everything(7) + m, n, k = mnk + world_size, dp_size = world_dp_size + a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 + w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 + w2 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 10 + score = torch.randn((m, e), device="cuda", dtype=dtype) + + parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk) diff --git a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py index 44734e9340aa..3b5838a99fa1 100644 --- a/tests/kernels/moe/test_triton_moe_ptpc_fp8.py +++ b/tests/kernels/moe/test_triton_moe_ptpc_fp8.py @@ -7,6 +7,7 @@ import torch from vllm import _custom_ops as ops +from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.platforms import current_platform @@ -15,6 +16,10 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + def native_w8a8_per_token_matmul(A, B, As, Bs, output_dtype=torch.float16): """Matrix multiplication function that supports per-token input @@ -137,20 +142,21 @@ def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): w2_s = torch.rand(E, K, device=w2_fp32.device) * factor_for_scale score = torch.randn((M, E), dtype=dtype) - ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) - out = fused_moe( - a, - w1, - w2, - score, - topk, - renormalize=False, - use_fp8_w8a8=True, # using fp8 - per_channel_quant=True, - w1_scale=w1_s, - w2_scale=w2_s, - block_shape=None, # Not using block quantization - ) + with set_current_vllm_config(vllm_config): + ref_out = torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk) + out = fused_moe( + a, + w1, + w2, + score, + topk, + renormalize=False, + use_fp8_w8a8=True, # using fp8 + per_channel_quant=True, + w1_scale=w1_s, + w2_scale=w2_s, + block_shape=None, # Not using block quantization + ) # Check results rel_diff = (torch.mean( diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index 38c7e461bb9c..ef1d7e47ef81 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -11,7 +11,7 @@ from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import fused_moe from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( - deep_gemm_moe_fp8) + _valid_deep_gemm_shape, deep_gemm_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size) @@ -30,6 +30,10 @@ pytest.skip("FP8 Triton requires CUDA 9.0 or higher", allow_module_level=True) +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + # Test configurations DTYPES = [torch.bfloat16] # [torch.half, torch.bfloat16, torch.float32] NUM_TOKENS = [7, 83, 2048] @@ -210,7 +214,6 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): score = torch.randn((M, E), 
dtype=dtype) # Set the context to avoid lots of warning spam. - vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): out = fused_moe( a, @@ -258,6 +261,7 @@ def per_block_cast_to_fp8( @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS)) +@pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.") @torch.inference_mode() def test_w8a8_block_fp8_deep_gemm_matmul(M, N, K, block_size, out_dtype, seed): # only aligned sizes @@ -381,15 +385,11 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed): block_size = [block_m, block_m] dtype = torch.bfloat16 - # only aligned sizes - if (N % block_m != 0 or K % block_m != 0 or topk > E): - pytest.skip( - f"Skipping test; bad size m={M}, n={N}, k={K}, topk={topk}, E={E}") - - if N <= 512: - pytest.skip("Skipping N <= 512 until performance issues solved.") + if topk > E: + pytest.skip(f"Skipping test: topk={topk} > E={E}") - vllm_config = VllmConfig() + if not _valid_deep_gemm_shape(M, N, K): + pytest.skip(f"Skipping test: invalid size m={M}, n={N}, k={K}") torch.manual_seed(seed) fp8_info = torch.finfo(torch.float8_e4m3fn) diff --git a/tests/kernels/quantization/test_block_int8.py b/tests/kernels/quantization/test_block_int8.py index 104f23fd7cd2..a4e9f83f0eaf 100644 --- a/tests/kernels/quantization/test_block_int8.py +++ b/tests/kernels/quantization/test_block_int8.py @@ -18,6 +18,10 @@ pytest.skip("INT8 Triton requires CUDA 7.0 or higher", allow_module_level=True) +vllm_config = VllmConfig() +vllm_config.scheduler_config.max_num_seqs = 128 +vllm_config.scheduler_config.max_model_len = 8192 + # For test def native_per_token_group_quant_int8(x, @@ -174,7 +178,6 @@ def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed): score = torch.randn((M, E), dtype=dtype) # Set the context to avoid lots of warning spam. 
- vllm_config = VllmConfig() with set_current_vllm_config(vllm_config): out = fused_moe( a, diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 2041a54e8c0d..51c519d8f862 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -23,6 +23,7 @@ """ import contextlib import gc +import importlib.util import pickle import weakref from collections import namedtuple @@ -42,7 +43,7 @@ from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname, - supports_custom_op) + run_once, supports_custom_op) @dataclass @@ -936,9 +937,49 @@ def init_distributed_environment( "world group already initialized with a different world size") +PPLX_DID_INIT: bool = False + + +@run_once +def pplx_init(rank, world_size): + has_pplx = importlib.util.find_spec("pplx_kernels") is not None + + if has_pplx and world_size > 1: + from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id, + nvshmem_get_unique_id, nvshmem_init) + try: + global PPLX_DID_INIT + logger.debug( + "Initialize NVSHMEM for PPLX kernels: rank=%d, " + "world size=%d", rank, world_size) + uid = nvshmem_get_unique_id( + ) if rank == 0 else nvshmem_alloc_empty_unique_id() + uid_gpu = uid.cuda() + get_world_group().broadcast(uid_gpu, src=0) + uid = uid_gpu.to(device='cpu') + logger.debug("PPLX NVSHMEM UID = %s", uid) + nvshmem_init(uid, rank, world_size) + PPLX_DID_INIT = True + except Exception as ex: + logger.error("Failed to initialize NVSHMEM for PPLX: %s", ex) + + +@run_once +def pplx_finalize(): + global PPLX_DID_INIT + if PPLX_DID_INIT: + from pplx_kernels.nvshmem import nvshmem_finalize + logger.debug("PPLX NVSHMEM finalize") + from vllm.model_executor.layers.fused_moe.layer import ( + _all_to_all_cache) + _all_to_all_cache.destroy() + nvshmem_finalize() + + def initialize_model_parallel( tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1, + enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """ @@ -1041,10 +1082,14 @@ def initialize_model_parallel( _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group, _EP.rank_in_group) + if enable_expert_parallel: + pplx_init(rank, world_size) + def ensure_model_parallel_initialized( tensor_model_parallel_size: int, pipeline_model_parallel_size: int, + enable_expert_parallel: bool = False, backend: Optional[str] = None, ) -> None: """Helper to initialize model parallel groups if they are not initialized, @@ -1055,7 +1100,8 @@ def ensure_model_parallel_initialized( get_world_group().device_group) if not model_parallel_is_initialized(): initialize_model_parallel(tensor_model_parallel_size, - pipeline_model_parallel_size, backend) + pipeline_model_parallel_size, + enable_expert_parallel, backend) return assert ( @@ -1133,6 +1179,9 @@ def get_tensor_model_parallel_rank(): def destroy_model_parallel(): """Set the groups to none and destroy them.""" global _TP + + pplx_finalize() + if _TP: _TP.destroy() _TP = None diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 68983b91b2be..6bb323d79d64 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -23,7 +23,7 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils import get_tcp_uri +from vllm.utils import get_tcp_uri, is_torch_equal_or_newer logger = init_logger(__name__) @@ -362,12 +362,11 @@ def stateless_destroy_torch_distributed_process_group( Destroy ProcessGroup 
returned by stateless_init_torch_distributed_process_group(). """ - # Lazy import for non-CUDA backends. - try: - # pytorch <= 2.6 + if is_torch_equal_or_newer("2.7"): + pg.shutdown() + else: + # Lazy import for non-CUDA backends. from torch.distributed.distributed_c10d import _shutdown_backend _shutdown_backend(pg) - except ImportError: - # pytorch >= 2.7 - pg.shutdown() + _unregister_process_group(pg.group_name) diff --git a/vllm/forward_context.py b/vllm/forward_context.py index eb1e1f5694bb..5d2d95f18d2f 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -27,6 +27,7 @@ @dataclass class DPMetadata: + max_tokens_across_dp_cpu: torch.Tensor cu_tokens_across_dp_cpu: torch.Tensor @@ -90,8 +91,10 @@ def set_forward_context(attn_metadata: Any, dtype=torch.int32) from vllm.distributed.parallel_state import get_dp_group dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group) + max_tokens_across_dp_cpu = torch.max(num_tokens_tensor) cu_tokens_across_dp_cpu = torch.cumsum(num_tokens_tensor, dim=0) - dp_metadata = DPMetadata(cu_tokens_across_dp_cpu) + dp_metadata = DPMetadata(max_tokens_across_dp_cpu, + cu_tokens_across_dp_cpu) global _forward_context prev_context = _forward_context diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 08be9de62621..5c262287f7dd 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -38,8 +38,8 @@ def get_config() -> Optional[dict[str, Any]]: from vllm.model_executor.layers.fused_moe.cutlass_moe import ( cutlass_moe_fp4, cutlass_moe_fp8) from vllm.model_executor.layers.fused_moe.fused_moe import ( - fused_experts, fused_moe, fused_topk, get_config_file_name, - grouped_topk) + TritonExperts, fused_experts, fused_moe, fused_topk, + get_config_file_name, grouped_topk) __all__ += [ "fused_moe", @@ -49,4 +49,5 @@ def get_config() -> Optional[dict[str, Any]]: "grouped_topk", "cutlass_moe_fp8", "cutlass_moe_fp4", + "TritonExperts", ] diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 7f96a4012716..aff108112b61 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -5,10 +5,176 @@ import torch +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.utils import _fp8_perm, _resize_cache from vllm.scalar_type import scalar_types +class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__( + self, + ab_strides1: torch.Tensor, + c_strides1: torch.Tensor, + ab_strides2: torch.Tensor, + c_strides2: torch.Tensor, + out_dtype: torch.dtype, + ): + super().__init__() + self.ab_strides1 = ab_strides1 + self.c_strides1 = c_strides1 + self.ab_strides2 = ab_strides2 + self.c_strides2 = c_strides2 + self.out_dtype = out_dtype + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + # Note that K, N are transposed + N, K = K, N + workspace1 = M * topk * max(2 * N, K) + workspace2 = M * topk * N + return (workspace1, workspace2, self.out_dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + 
global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + a1q = hidden_states + + assert w1_scale is not None + assert w2_scale is not None + assert w1.dtype == torch.float8_e4m3fn + assert w2.dtype == torch.float8_e4m3fn + assert a1q.shape[1] == w1.shape[1], "Hidden size mismatch w1" + assert w1.shape[2] == w2.shape[1] * 2, "Hidden size mismatch w2" + assert w1.shape[0] == w2.shape[0], "Expert number mismatch" + assert a1q_scale is None or a1q_scale.dim( + ) == 0 or a1q_scale.shape[0] == 1 or a1q_scale.shape[0] == a1q.shape[ + 0], "Input scale shape mismatch" + assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[ + 1] == w1.shape[2], "W1 scale shape mismatch" + assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[ + 1] == w2.shape[2], "W2 scale shape mismatch" + assert w1.shape[0] == w2.shape[0], "Weights expert number mismatch" + assert w1.shape[0] == w1_scale.shape[ + 0], "w1 scales expert number mismatch" + assert w1.shape[0] == w2_scale.shape[ + 0], "w2 scales expert number mismatch" + assert a2_scale is None or a1q_scale is None or a2_scale.shape == a1q_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 + assert self.ab_strides1.shape[0] == w1.shape[ + 0], "AB Strides 1 expert number mismatch" + assert self.c_strides1.shape[0] == w1.shape[ + 0], "C Strides 1 expert number mismatch" + assert self.ab_strides2.shape[0] == w2.shape[ + 0], "AB Strides 2 expert number mismatch" + assert self.c_strides2.shape[0] == w2.shape[ + 0], "C Strides 2 expert number mismatch" + assert self.out_dtype in [torch.half, + torch.bfloat16], "Invalid output dtype" + + M = a1q.shape[0] + _, N, K = w2.shape # because w1 + w2 are transposed + device = a1q.device + + assert w1.shape[1] == K + assert global_num_experts != -1 + assert a1q_scale is not None + + if expert_map is not None: + "Translate info from expert_map to topk_ids" + local_topk_ids = torch.where(expert_map[topk_ids] != -1, + expert_map[topk_ids], -1) + else: + local_topk_ids = topk_ids + + topk = local_topk_ids.shape[1] + + per_act_token = a1q_scale.numel() != 1 if a1q_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + + expert_offsets = torch.empty((global_num_experts + 1), + dtype=torch.int32, + device=device) + problem_sizes1 = torch.empty((global_num_experts, 3), + dtype=torch.int32, + device=device) + problem_sizes2 = torch.empty((global_num_experts, 3), + dtype=torch.int32, + device=device) + + # With expert_map each Rank processes only a subset of experts. As + # a result not all of a_map and c2 tensors are filled. We fill it + # zeros for correctness. 
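+        # (Experts owned by other ranks were mapped to -1 in local_topk_ids
+        # above, so get_cutlass_moe_mm_data never writes their a_map slots;
+        # c3 is zero-filled further below for the same reason.)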
+ if expert_map is not None: + a_map = torch.zeros((local_topk_ids.numel()), + dtype=torch.int32, + device=device) + else: + a_map = torch.empty((local_topk_ids.numel()), + dtype=torch.int32, + device=device) + + c_map = torch.empty((local_topk_ids.numel()), + dtype=torch.int32, + device=device) + + ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, + problem_sizes1, problem_sizes2, a_map, + c_map, global_num_experts, N, K) + + a1q = _fp8_perm(a1q, a_map) + a1q_scale = a1q_scale[a_map] if per_act_token else a1q_scale + + c1 = _resize_cache(workspace13, (M * topk, N * 2)) + c2 = _resize_cache(workspace2, (M * topk, N)) + c3 = _resize_cache(workspace13, (M * topk, K)) + + ops.cutlass_moe_mm(c1, a1q, w1, a1q_scale, w1_scale, + expert_offsets[:-1], problem_sizes1, + self.ab_strides1, self.ab_strides1, self.c_strides1) + + self.activation(activation, c2, c1) + + a2q, a2q_scale = ops.scaled_fp8_quant( + c2, a2_scale, use_per_token_if_dynamic=per_act_token) + + if expert_map is not None: + c3.fill_(0) + + ops.cutlass_moe_mm(c3, a2q, w2, a2q_scale, w2_scale, + expert_offsets[:-1], problem_sizes2, + self.ab_strides2, self.ab_strides2, self.c_strides2) + + c3 = c3[c_map] + + return c3 + + #TODO make the grouped gemm kernel consistent with scaled gemm kernel def cutlass_moe_fp8( a: torch.Tensor, @@ -17,7 +183,7 @@ def cutlass_moe_fp8( w1_scale: torch.Tensor, w2_scale: torch.Tensor, topk_weights: torch.Tensor, - topk_ids_: torch.Tensor, + topk_ids: torch.Tensor, ab_strides1: torch.Tensor, c_strides1: torch.Tensor, ab_strides2: torch.Tensor, @@ -59,7 +225,7 @@ def cutlass_moe_fp8( - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize the intermediate result between the gemms. Shape: scalar or [M] - - out_dtype (torch.Tensor): The output tensor type. + - out_dtype (torch.dtype): The output tensor type. - expert_map (Optional[torch.Tensor]): In the case of Expert parallel, every Rank is responsible for a subset of experts. expert_map is a mapping from global expert-id to local expert-id. When expert_map[i] @@ -71,115 +237,36 @@ def cutlass_moe_fp8( Returns: - torch.Tensor: The fp16 output tensor after applying the MoE layer. 
""" - - assert topk_weights.shape == topk_ids_.shape, "topk shape mismatch" - assert w1_q.dtype == torch.float8_e4m3fn - assert w2_q.dtype == torch.float8_e4m3fn - assert a.shape[1] == w1_q.shape[1], "Hidden size mismatch w1" - assert w1_q.shape[2] == w2_q.shape[1] * 2, "Hidden size mismatch w2" - assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch" - assert a1_scale is None or a1_scale.dim( - ) == 0 or a1_scale.shape[0] == 1 or a1_scale.shape[0] == a.shape[ - 0], "Input scale shape mismatch" - assert w1_scale.dim() == 1 or w1_scale.shape[1] == 1 or w1_scale.shape[ - 1] == w1_q.shape[2], "W1 scale shape mismatch" - assert w2_scale.dim() == 1 or w2_scale.shape[1] == 1 or w2_scale.shape[ - 1] == w2_q.shape[2], "W2 scale shape mismatch" - assert w1_q.shape[0] == w2_q.shape[0], "Weights expert number mismatch" - assert w1_q.shape[0] == w1_scale.shape[ - 0], "w1 scales expert number mismatch" - assert w1_q.shape[0] == w2_scale.shape[ - 0], "w2 scales expert number mismatch" - assert a2_scale is None or a1_scale is None or a2_scale.shape == a1_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 - assert ab_strides1.shape[0] == w1_q.shape[ - 0], "AB Strides 1 expert number mismatch" - assert c_strides1.shape[0] == w1_q.shape[ - 0], "C Strides 1 expert number mismatch" - assert ab_strides2.shape[0] == w2_q.shape[ - 0], "AB Strides 2 expert number mismatch" - assert c_strides2.shape[0] == w2_q.shape[ - 0], "C Strides 2 expert number mismatch" - assert out_dtype in [torch.half, torch.bfloat16], "Invalid output dtype" - - num_experts = w1_q.size(0) - m = a.size(0) - k = w1_q.size(1) - n = w2_q.size(1) - - local_topk_ids = topk_ids_ - if expert_map is not None: - "Translate info from expert_map to topk_ids" - local_topk_ids = torch.where(expert_map[topk_ids_] != -1, - expert_map[topk_ids_], -1) - - topk = local_topk_ids.size(1) - per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( a2_scale.numel() != 1 if a2_scale is not None else False) - if apply_router_weight_on_input: - assert topk == 1, \ - "apply_router_weight_on_input is only implemented for topk=1" - # TODO: this only works for topK=1, will need to update for topK>1 - a = a * topk_weights.to(out_dtype) - - a_q, a1_scale = ops.scaled_fp8_quant( - a, a1_scale, use_per_token_if_dynamic=per_act_token) - device = a_q.device - - expert_offsets = torch.empty((num_experts + 1), - dtype=torch.int32, - device=device) - problem_sizes1 = torch.empty((num_experts, 3), - dtype=torch.int32, - device=device) - problem_sizes2 = torch.empty((num_experts, 3), - dtype=torch.int32, - device=device) - - a_map_initializer = torch.empty - c2_initializer = torch.empty - if expert_map is not None: - # With expert_map each Rank processes only a subset of experts. As - # a result not all of a_map and c2 tensors are filled. We fill it - # zeros for correctness. 
- a_map_initializer = torch.zeros - c2_initializer = torch.zeros - - a_map = a_map_initializer((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - c_map = torch.empty((local_topk_ids.numel()), - dtype=torch.int32, - device=device) - ops.get_cutlass_moe_mm_data(local_topk_ids, expert_offsets, problem_sizes1, - problem_sizes2, a_map, c_map, num_experts, n, - k) - - rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype) - rep_a1_scales = a1_scale[a_map] if per_act_token else a1_scale - - c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype) - c2 = c2_initializer((m * topk, k), device=device, dtype=out_dtype) - - ops.cutlass_moe_mm(c1, rep_a_q, w1_q, rep_a1_scales, w1_scale, - expert_offsets[:-1], problem_sizes1, ab_strides1, - ab_strides1, c_strides1) - - intermediate = torch.empty((m * topk, n), device=device, dtype=out_dtype) - torch.ops._C.silu_and_mul(intermediate, c1) - - intemediate_q, a2_scale = ops.scaled_fp8_quant( - intermediate, a2_scale, use_per_token_if_dynamic=per_act_token) - - ops.cutlass_moe_mm(c2, intemediate_q, w2_q, a2_scale, w2_scale, - expert_offsets[:-1], problem_sizes2, ab_strides2, - ab_strides2, c_strides2) - # Gather tokens - c2 = c2[c_map].view(m, topk, k) - if not apply_router_weight_on_input: - c2 = c2 * topk_weights.view(m, topk, 1).to(out_dtype) - return c2.sum(dim=1) + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP( + per_channel_quant=per_act_token, + quant_dtype=torch.float8_e4m3fn, + ), + CutlassExpertsFp8( + ab_strides1, + c_strides1, + ab_strides2, + c_strides2, + out_dtype, + ), + ) + + return fn( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + expert_map=expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + ) FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max() diff --git a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py index 5098e15dc5a4..46a814e6ecc3 100644 --- a/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py +++ b/vllm/model_executor/layers/fused_moe/deep_gemm_moe.py @@ -1,16 +1,17 @@ # SPDX-License-Identifier: Apache-2.0 +import functools import importlib.util from typing import Optional import torch -import vllm.envs as envs -from vllm import _custom_ops as ops +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm.logger import init_logger -from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( - moe_align_block_size) -from vllm.model_executor.layers.fused_moe.utils import (_fp8_perm, - _fp8_quantize, +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + _moe_permute) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.utils import (_fp8_quantize, _resize_cache) from vllm.utils import round_up @@ -19,6 +20,19 @@ has_deep_gemm = importlib.util.find_spec("deep_gemm") is not None +@functools.cache +def deep_gemm_block_shape() -> list[int]: + # Lazy import to avoid CUDA initialization problems. 
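+    # (Thanks to the @functools.cache decorator above, the deep_gemm import
+    # and the alignment query below run only once per process.)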
+ import deep_gemm as dg + block = dg.get_m_alignment_for_contiguous_layout() + return [block, block] + + +def _valid_deep_gemm_shape(M: int, N: int, K: int): + align = deep_gemm_block_shape()[0] + return align <= M and N % align == 0 and K % align == 0 + + def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -29,89 +43,112 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, aligned by `dg.get_m_alignment_for_contiguous_layout()`. """ if not has_deep_gemm: + logger.debug("DeepGemm disabled: deep_gemm not available.") return False - # Lazy import to avoid CUDA initialization problems. - import deep_gemm as dg - - # Expert maps not supported yet. if expert_map is not None: + logger.debug("DeepGemm disabled: expert map NYI.") return False - align = dg.get_m_alignment_for_contiguous_layout() - M = hidden_states.shape[0] - _, K, N = w2.shape - - # For now, disable DeepGemm for small N until better permute/unpermute - # ops are available. - if N <= 512: + M = hidden_states.size(0) + _, K, N = w2.size() + if not _valid_deep_gemm_shape(M, N, K): + logger.debug("DeepGemm disabled: unalinged problem size.") return False - if align > M or N % align != 0 or K % align != 0: + if (w1.dtype != torch.float8_e4m3fn or w2.dtype != torch.float8_e4m3fn): + logger.debug("DeepGemm disabled: invalid weight dtype(s).") return False - return (hidden_states.is_contiguous() and w1.is_contiguous() - and w2.is_contiguous()) - - -def _moe_permute( - curr_hidden_states: torch.Tensor, - a1q_scale: Optional[torch.Tensor], - curr_topk_ids: torch.Tensor, - global_num_experts: int, - expert_map: Optional[torch.Tensor], - block_m: int, -) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor, - Optional[torch.Tensor]]: - """ - Determine the sorted_token_ids, expert_ids for the given problem size. - Permute the hidden states and scales according to `sorted_token_ids`. 
- """ - top_k_num = curr_topk_ids.shape[1] - - tokens_in_chunk, _ = curr_hidden_states.shape + if (not hidden_states.is_contiguous() or not w1.is_contiguous() + or not w2.is_contiguous()): + logger.debug( + "DeepGemm disabled: weights or activations not contiguous.") + return False - sorted_token_ids, expert_ids, num_tokens_post_padded = ( - moe_align_block_size(curr_topk_ids, - block_m, - global_num_experts, - expert_map, - pad_sorted_ids=True)) + return True + + +class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self): + super().__init__() + self.block_shape = deep_gemm_block_shape() + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + block_m = self.block_shape[0] + M_sum = (M * topk) + num_experts * (block_m - 1) + M_sum = round_up(M_sum, block_m) + workspace1 = M_sum * max(N * 2, K) + workspace2 = M_sum * N + return (workspace1, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + import deep_gemm as dg + + a1q = hidden_states + _, N, K = w1.size() + + assert global_num_experts != -1 + assert w2.size(1) == K + + a1q, a1q_scale, _, expert_ids, inv_perm = _moe_permute( + a1q, + a1q_scale, + topk_ids, + global_num_experts, + expert_map, + self.block_shape[0], + ) + + # Note: M_sum is different than the pre-permuted shape of a1q. + M_sum = a1q.size(0) + workspace1 = _resize_cache(workspace13, (M_sum, N)) + workspace2 = _resize_cache(workspace2, (M_sum, N // 2)) + workspace3 = _resize_cache(workspace13, (M_sum, K)) - inv_perm: Optional[torch.Tensor] = None + dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( + (a1q, a1q_scale), (w1, w1_scale), workspace1, expert_ids) - num_tokens = top_k_num * tokens_in_chunk - sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1) - expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0) - inv_perm = torch.argsort(sorted_token_ids)[:num_tokens] + self.activation(activation, workspace2, workspace1.view(-1, N)) - # Permute according to sorted token ids. - curr_hidden_states = _fp8_perm(curr_hidden_states, - sorted_token_ids // top_k_num) + a2q_scale: Optional[torch.Tensor] = None - if a1q_scale is not None: - a1q_scale = a1q_scale[sorted_token_ids // top_k_num] + a2q, a2q_scale = _fp8_quantize(workspace2, a2_scale, False, + self.block_shape) - return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) + dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( + (a2q, a2q_scale), (w2, w2_scale), workspace3, expert_ids) + workspace3 = workspace3[inv_perm, ...] -def _moe_unpermute_and_reduce( - out: torch.Tensor, - curr_hidden: torch.Tensor, - inv_perm: Optional[torch.Tensor], - topk_weight: torch.Tensor, -) -> None: - """ - Unpermute the final result and apply topk_weights, then perform the final - reduction on the hidden states. - """ - M, topk = topk_weight.shape - K = curr_hidden.shape[1] - curr_hidden = curr_hidden[inv_perm, ...] 
- curr_hidden = curr_hidden.view(-1, topk, K) - curr_hidden.mul_(topk_weight.view(M, -1, 1)) - ops.moe_sum(curr_hidden, out) + return workspace3 def deep_gemm_moe_fp8( @@ -128,6 +165,7 @@ def deep_gemm_moe_fp8( expert_map: Optional[torch.Tensor] = None, a1_scale: Optional[torch.Tensor] = None, a2_scale: Optional[torch.Tensor] = None, + apply_router_weight_on_input=False, ) -> torch.Tensor: """ This function computes a a8w8-quantized Mixture of Experts (MoE) layer @@ -166,129 +204,24 @@ def deep_gemm_moe_fp8( Returns: - torch.Tensor: The bfloat16 output tensor after applying the MoE layer. """ - # Lazy import to avoid CUDA initialization problems. - import deep_gemm as dg - - assert expert_map is None, "Expert maps not supported yet" - - assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" - - assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" - assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert w1.stride(-1) == 1, "Stride of last dimension must be 1" - assert w2.stride(-1) == 1, "Stride of last dimension must be 1" - assert hidden_states.dtype in [ - torch.float32, torch.float16, torch.bfloat16 - ] - assert w1.dtype == torch.float8_e4m3fn - assert w2.dtype == torch.float8_e4m3fn - assert w1.shape[0] == w2.shape[0], "Expert number mismatch" - assert w1.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch" - assert w1.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch" - assert a1_scale is None or a1_scale.dim( - ) == 0 or a1_scale.shape[0] == 1 or a1_scale.shape[ - 0] == hidden_states.shape[0], "Input scale shape mismatch" - assert a2_scale is None or a1_scale is None or a2_scale.shape == a1_scale.shape, "Intermediate scale shape mismatch" # noqa: E501 - - num_tokens, _ = hidden_states.shape - E, N, _ = w1.shape - K = w2.shape[1] - if global_num_experts == -1: - global_num_experts = E - - # We execute the fused_moe kernel in chunks to circumvent this issue: - # https://github.com/vllm-project/vllm/issues/5938 - CHUNK_SIZE = envs.VLLM_FUSED_MOE_CHUNK_SIZE - - assert _valid_deep_gemm(hidden_states, w1, w2, expert_map) - - if inplace: - out_hidden_states = hidden_states - else: - out_hidden_states = torch.empty_like(hidden_states) - - block_m = dg.get_m_alignment_for_contiguous_layout() - block_shape = [block_m, block_m] - - assert w1_scale is not None - assert w2_scale is not None - - # We attempt to transpose and align offline in Fp8MoEMethod, in which - # case these calls will be nops. Otherwise, they'll be performed every - # time the layer is executed. 
- w1_scale = dg.get_col_major_tma_aligned_tensor(w1_scale).contiguous() - w2_scale = dg.get_col_major_tma_aligned_tensor(w2_scale).contiguous() - - M_sum = topk_ids.numel() + global_num_experts * (block_m - 1) - M_sum = round_up(M_sum, block_m) - - num_chunks = (num_tokens // CHUNK_SIZE) + 1 - - # We can reuse the memory between cache1 and cache3 because by the time - # we need cache3, we're done with cache1 - workspace13 = torch.empty(M_sum * max(N, K), - device=hidden_states.device, - dtype=hidden_states.dtype) - - workspace1 = workspace13[:M_sum * N].view(M_sum, N) - workspace2 = torch.empty((M_sum, N // 2), - device=hidden_states.device, - dtype=hidden_states.dtype) - workspace3 = workspace13[:M_sum * K].view(M_sum, K) - - for chunk in range(num_chunks): - begin_chunk_idx, end_chunk_idx = (chunk * CHUNK_SIZE, - min((chunk + 1) * CHUNK_SIZE, - num_tokens)) - curr_hidden_states = hidden_states[begin_chunk_idx:end_chunk_idx] - tokens_in_chunk, _ = curr_hidden_states.shape - - if tokens_in_chunk == 0: - break - - curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] - curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] - - a1q_scale: Optional[torch.Tensor] = None - - qcurr_hidden_states, a1q_scale = _fp8_quantize(curr_hidden_states, - a1_scale, block_shape) - - (qcurr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, - inv_perm) = _moe_permute(qcurr_hidden_states, a1q_scale, - curr_topk_ids, global_num_experts, - expert_map, block_m) - - # Adjust the intermediate cache size and config for the last chunk. - # Note that in most cases we only have one chunk so the cache size - # and config are already set correctly and do not need to be adjusted. - if tokens_in_chunk < CHUNK_SIZE and chunk > 0: - curr_M = sorted_token_ids.numel() - workspace1 = _resize_cache(workspace1, (curr_M, N)) - workspace2 = _resize_cache(workspace2, (curr_M, N // 2)) - workspace3 = _resize_cache(workspace3, (curr_M, K)) - - dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( - (qcurr_hidden_states, a1q_scale), (w1, w1_scale), workspace1, - expert_ids) - - if activation == "silu": - torch.ops._C.silu_and_mul(workspace2, workspace1.view(-1, N)) - elif activation == "gelu": - torch.ops._C.gelu_and_mul(workspace2, workspace1.view(-1, N)) - else: - raise ValueError(f"Unsupported FusedMoe activation: {activation}") - - a2q_scale: Optional[torch.Tensor] = None - - qworkspace2, a2q_scale = _fp8_quantize(workspace2, a2_scale, - block_shape) - - dg.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( - (qworkspace2, a2q_scale), (w2, w2_scale), workspace3, expert_ids) - - _moe_unpermute_and_reduce( - out_hidden_states[begin_chunk_idx:end_chunk_idx], - workspace3.view(*workspace3.shape), inv_perm, curr_topk_weights) - - return out_hidden_states + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(quant_dtype=torch.float8_e4m3fn, + block_shape=deep_gemm_block_shape()), + DeepGemmExperts(), + ) + return fn( + hidden_states, + w1, + w2, + topk_weights, + topk_ids, + inplace, + activation, + global_num_experts, + expert_map, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py new file mode 100644 index 000000000000..c2db79365931 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -0,0 +1,755 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Fused batched MoE kernel.""" +from 
typing import Optional + +import torch +import triton +import triton.language as tl + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.fused_moe import ( + get_config_dtype_str, try_get_optimal_moe_config) +from vllm.model_executor.layers.fused_moe.utils import _resize_cache + + +@triton.jit +def moe_mmk( + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ak, + stride_bk, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Offsets and masks + offs_m, + offs_n, + mask_m, + # Block size for block-wise quantization + group_n: tl.constexpr, + group_k: tl.constexpr, + # Meta-parameters + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr, + compute_type: tl.constexpr, + use_w8a8: tl.constexpr, + use_w8a16: tl.constexpr): + + offs_k = tl.arange(0, BLOCK_K) + + if use_w8a16: + b_scale_ptrs = b_scale_ptr + expert_id * stride_bse + offs_n[ + None, :] * stride_bsn + b_scale = tl.load(b_scale_ptrs) + + if use_w8a8: + # block-wise + if group_k > 0 and group_n > 0: + a_scale_ptrs = a_scale_ptr + offs_m * stride_asm + offs_bsn = offs_n // group_n + b_scale_ptrs = (b_scale_ptr + expert_id * stride_bse + + offs_bsn * stride_bsn) + # tensor-wise + else: + a_scale = tl.load(a_scale_ptr) + b_scale = tl.load(b_scale_ptr + expert_id) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher accuracy. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_K)): + # Load the next block of A and B, generate a mask by checking the + # K dimension. + a = tl.load(a_ptrs, + mask=mask_m[:, None] & (offs_k[None, :] < K - k * BLOCK_K), + other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_K, other=0.0) + # We accumulate along the K dimension. + if use_w8a16: + accumulator = tl.dot(a, b.to(compute_type), acc=accumulator) + elif use_w8a8: + if group_k > 0 and group_n > 0: + k_start = k * BLOCK_K + offs_ks = k_start // group_k + a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, + mask=mask_m, + other=0.0) + b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) + + accumulator += tl.dot(a, b) * a_scale[:, + None] * b_scale[None, :] + else: + if use_w8a8: + # acc used to enable fp8_fast_accum + accumulator = tl.dot(a, b, acc=accumulator) + else: + accumulator += tl.dot(a, b) + else: + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. 
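+        # (Strides are expressed in elements, so this moves both the A and B
+        # tiles one BLOCK_K step along the K dimension.)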
+ a_ptrs += BLOCK_K * stride_ak + b_ptrs += BLOCK_K * stride_bk + + if use_w8a16: + accumulator = (accumulator * b_scale).to(compute_type) + elif use_w8a8: + if group_k > 0 and group_n > 0: + accumulator = accumulator.to(compute_type) + else: + accumulator = (accumulator * a_scale * b_scale).to(compute_type) + else: + accumulator = accumulator.to(compute_type) + + return accumulator + + +@triton.jit +def expert_triton_kernel( + a_ptr, #[max_tokens, K] + b_ptr, #[K, N] + c_ptr, #[max_tokens, N] + expert_id, + compute_type: tl.constexpr, + # Dimensions + M, + N, + K, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # strides + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr): + + offs_m = tl.arange(0, BLOCK_M) + offs_n = tl.arange(0, BLOCK_N) % N + offs_k = tl.arange(0, BLOCK_K) + mask_m = offs_m < M + + a_ptrs = a_ptr + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak + b_ptrs = b_ptr + offs_k[:, None] * stride_bk + offs_n[None, :] * stride_bn + + accumulator = moe_mmk( + a_ptrs, + b_ptrs, + K, + expert_id, + a_scale_ptr, + b_scale_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). + stride_ak, + stride_bk, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Offsets and masks + offs_m, + offs_n, + mask_m, + # Block size for block-wise quantization + group_n, + group_k, + # Meta-parameters + BLOCK_M, + BLOCK_N, + BLOCK_K, + compute_type, + use_fp8_w8a8, + use_int8_w8a16) + + # store in C + offs_cn = tl.arange(0, BLOCK_N) + c_ptrs = c_ptr + offs_m[:, None] * stride_cm + offs_cn[None, :] * stride_cn + c_mask = mask_m[:, None] & (offs_cn[None, :] < N) + tl.store(c_ptrs, accumulator, mask=c_mask) + + +@triton.jit +def batched_triton_kernel( + a_ptr, # [E, max_num_tokens, K] + b_ptr, # [E, K, N] + c_ptr, # [E, max_num_tokens, N] + expert_num_tokens, # [E] + compute_type: tl.constexpr, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # The stride variables represent how much to increase the ptr by when + # moving by 1 element in a particular dimension. E.g. `stride_am` is + # how much to increase `a_ptr` by to get the element one row down + # (A has M rows). 
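+        # Relative to moe_mmk, the batched layout carries a leading expert
+        # dimension, hence the per-expert strides stride_ae / stride_be /
+        # stride_ce below.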
+ stride_ae, + stride_am, + stride_ak, + stride_be, + stride_bk, + stride_bn, + stride_ce, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + group_n: tl.constexpr, + group_k: tl.constexpr, + # Quantization schemes + use_fp8_w8a8: tl.constexpr, + use_int8_w8a16: tl.constexpr, + # Kernel config + BLOCK_M: tl.constexpr, + BLOCK_N: tl.constexpr, + BLOCK_K: tl.constexpr): + expert_id = tl.program_id(axis=0) + e_num_tokens = tl.load(expert_num_tokens + expert_id) + if e_num_tokens == 0: + # Early exit + return + + pid_mn = tl.program_id(axis=1) + #num_pid_m = tl.cdiv(max_num_tokens, BLOCK_M) + num_pid_n = tl.cdiv(N, BLOCK_N) + pid_m = pid_mn // num_pid_n + pid_n = pid_mn % num_pid_n + + cta_m_start = pid_m * BLOCK_M + cta_n_start = pid_n * BLOCK_N + if cta_m_start >= e_num_tokens: + # Early exit + return + + cta_m_size = min(BLOCK_M, e_num_tokens - cta_m_start) + cta_n_size = min(BLOCK_N, N - cta_n_start) + + a_ptr = a_ptr + expert_id * stride_ae + cta_m_start * stride_am + b_ptr = b_ptr + expert_id * stride_be + cta_n_start * stride_bn + c_ptr = (c_ptr + expert_id * stride_ce + cta_m_start * stride_cm + + cta_n_start * stride_cn) + + expert_triton_kernel( + a_ptr, + b_ptr, + c_ptr, + expert_id, + compute_type, + cta_m_size, # M + cta_n_size, # N + K, # K + a_scale_ptr, + b_scale_ptr, + b_zp_ptr, + # Strides + stride_am, + stride_ak, + stride_bk, + stride_bn, + stride_cm, + stride_cn, + stride_asm, + stride_ask, + stride_bse, + stride_bsk, + stride_bsn, + # Blockwise quantization data + group_n, + group_k, + # Quantization schemes + use_fp8_w8a8, + use_int8_w8a16, + # Kernel config + BLOCK_M, + BLOCK_N, + BLOCK_K) + + +def invoke_moe_batched_triton_kernel( + A: torch.Tensor, # [E, max_tokens, K] + B: torch.Tensor, # [E, K, N] + C: torch.Tensor, # [E, max_tokens, N] + expert_num_tokens: torch.Tensor, # [E] + compute_type: tl.dtype, + # Quantization data + A_scale: torch.Tensor, + B_scale: torch.Tensor, + B_zp: torch.Tensor, + # Quantization schemes + use_fp8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + config: dict[str, int], + block_shape: Optional[list[int]] = None): + + assert not use_int4_w4a16 + max_num_tokens = A.size(1) + K = A.size(2) + N = C.size(2) + + BLOCK_M = config['BLOCK_SIZE_M'] + BLOCK_N = config['BLOCK_SIZE_N'] + BLOCK_K = config['BLOCK_SIZE_K'] + assert (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing() + or max_num_tokens % BLOCK_M == 0) + + grid = (expert_num_tokens.size(0), triton.cdiv(max_num_tokens, BLOCK_M) * + triton.cdiv(B.size(1), BLOCK_N)) + + batched_triton_kernel[grid]( + A, + B, + C, + expert_num_tokens, + compute_type, + # Dimensions + max_num_tokens, + K, + N, + # Quantization data + A_scale, + B_scale, + B_zp, + # Strides + A.stride(0), + A.stride(1), + A.stride(2), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(0), + C.stride(1), + C.stride(2), + A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + # Blockwise quantization data + 0 if block_shape is None else block_shape[0], + 0 if block_shape is None else block_shape[1], + # Quantization schemes + use_fp8_w8a8, + use_int8_w8a16, + # Kernel config + BLOCK_M=BLOCK_M, + 
BLOCK_N=BLOCK_N, + BLOCK_K=BLOCK_K) + + +class BatchedPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + """ + A reference prepare/finalize class that reorganizes the tokens into + expert batched format, i.e. E x max_num_tokens x K. This is the format + that the PPLX dispatch/combine kernels use. + """ + + def __init__(self, max_num_tokens: Optional[int], world_size: int, + dp_size: int, rank: int): + super().__init__() + self.world_size = world_size + self.dp_size = dp_size + self.rank = rank + self.max_num_tokens = max_num_tokens + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + assert a1.dim() == 2 + assert topk_ids.dim() == 2 + assert topk_ids.size(0) == a1.size(0) + + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, \ + "apply_router_weight_on_input is only implemented for topk=1" + a1.mul_(topk_weights.to(a1.dtype)) + + num_tokens, hidden_dim = a1.size() + topk = topk_ids.size(1) + + if self.max_num_tokens is None: + tokens_per_expert = torch.bincount(topk_ids.view(-1), + minlength=num_experts) + self.max_num_tokens = int(tokens_per_expert.max().item()) + else: + tokens_per_expert = torch.zeros(num_experts, + dtype=torch.int, + device=a1.device) + + assert num_experts % self.world_size == 0 + + num_local_experts = num_experts // self.world_size + + b_a1 = torch.zeros( + (num_local_experts, self.max_num_tokens, hidden_dim), + dtype=a1.dtype, + device=a1.device) + + first_expert = num_local_experts * self.rank + last_expert = first_expert + num_local_experts + + for expert_id in range(first_expert, last_expert): + topks = torch.any(topk_ids == expert_id, dim=1).flatten() + rows = torch.count_nonzero(topks.flatten()) + b_a1[expert_id - + first_expert, :rows, :] = a1[:topks.numel()][topks] + tokens_per_expert[expert_id - first_expert] = rows + + return b_a1, a1_scale, tokens_per_expert + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + num_tokens = topk_ids.size(0) + num_local_experts = fused_expert_output.size(0) + K = fused_expert_output.size(-1) + assert output.size(0) == num_tokens and output.size(1) == K + + output.fill_(0) + + first_expert = num_local_experts * self.rank + last_expert = first_expert + num_local_experts + + for expert_id in range(first_expert, last_expert): + matching_tokens = topk_ids == expert_id + topks = torch.any(matching_tokens, dim=1).flatten() + rows = torch.count_nonzero(topks) + rhs = fused_expert_output[expert_id - first_expert, :rows, :] + if not apply_router_weight_on_input: + rhs.mul_(topk_weights[matching_tokens].view(rhs.size(0), 1)) + output[topks] = output[topks] + rhs + + +class BatchedExperts(mk.FusedMoEPermuteExpertsUnpermute): + """ + A reference MoE expert class that operates on expert batched format, + i.e. E x max_num_tokens x K. This is the format that the pplx + dispatch/combine kernels use. 
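+
+    The per-expert GEMMs are plain PyTorch matmuls, so this class is mainly
+    useful as a correctness reference. It is typically composed with
+    BatchedPrepareAndFinalize through FusedMoEModularKernel; a minimal,
+    purely illustrative single-rank composition might look like:
+
+        fused = mk.FusedMoEModularKernel(
+            BatchedPrepareAndFinalize(max_num_tokens=None, world_size=1,
+                                      dp_size=1, rank=0),
+            BatchedExperts(world_size=1, dp_size=1))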
+ """ + + def __init__( + self, + world_size: int, + dp_size: int, + max_num_tokens: Optional[int] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + block_shape: Optional[list[int]] = None, + block_m: Optional[int] = None, + ): + super().__init__() + assert block_shape is None + assert block_m is None + assert not use_fp8_w8a8, "NYI" + assert not use_int8_w8a8, "NYI" + assert not use_int8_w8a16, "NYI" + assert not use_int4_w4a16, "NYI" + self.max_num_tokens = max_num_tokens + self.world_size = world_size + self.dp_size = dp_size + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + assert a.dim() == 2 + num_dp = self.world_size // self.dp_size + max_num_tokens = a.size( + 0) if self.max_num_tokens is None else self.max_num_tokens + #print(f"WORKSPACE {max_num_tokens} {num_dp}") + workspace13 = num_experts * max_num_tokens * num_dp * K + workspace2 = max_num_tokens * num_dp * N + return (workspace13, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + assert hidden_states.dim() == 3 + assert expert_num_tokens is not None + hidden_dim = hidden_states.size(-1) + + if self.max_num_tokens is None: + max_num_tokens = hidden_states.size(1) + else: + max_num_tokens = self.max_num_tokens + + num_dp = self.world_size // self.dp_size + num_experts = global_num_experts + out = _resize_cache(workspace13, + (num_experts, max_num_tokens * num_dp, hidden_dim)) + num_local_experts = w1.size(0) + assert num_local_experts == w1.size(0), ( + f"{num_local_experts} == {w1.size(0)}") + + N = w1.size(1) // 2 + + # Not cudagraph friendly + assert (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing() + or torch.all(expert_num_tokens <= max_num_tokens * num_dp)), ( + f"{expert_num_tokens} <= {max_num_tokens * num_dp}") + + for expert in range(num_local_experts): + # Indexing expert_num_tokens doesn't work w/cudagraphs or inductor + if (torch.compiler.is_compiling() + or torch.cuda.is_current_stream_capturing()): + num = max_num_tokens * num_dp + else: + num = int(expert_num_tokens[expert].item()) + tmp = _resize_cache(workspace2, (num, N)) + input = hidden_states[expert, :num, :] @ w1[expert].transpose(0, 1) + self.activation(activation, tmp, input) + out[expert, :num, :] = tmp @ w2[expert].transpose(0, 1) + + return out + + +class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + """ + A Triton based MoE expert class that operates on expert batched format, + i.e. E x max_num_tokens x K. This is the format that the pplx + dispatch/combine kernels use. 
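+    Both grouped GEMMs are dispatched through
+    invoke_moe_batched_triton_kernel defined above.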
+ """ + + def __init__( + self, + max_num_tokens: Optional[int] = None, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + block_shape: Optional[list[int]] = None, + world_size: int = 1, + dp_size: int = 1, + ): + super().__init__() + self.use_fp8_w8a8 = use_fp8_w8a8 + self.use_int8_w8a8 = use_int8_w8a8 + self.use_int4_w4a16 = use_int4_w4a16 + self.use_int8_w8a16 = use_int8_w8a16 + self.block_shape = block_shape + self.max_num_tokens = max_num_tokens + assert not use_int8_w8a8, "NYI" + assert not use_int4_w4a16, "NYI" + self.world_size = world_size + self.dp_size = dp_size + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + assert a.dim() == 2 + num_dp = self.world_size // self.dp_size + max_num_tokens = a.size( + 0) if self.max_num_tokens is None else self.max_num_tokens + workspace13 = num_experts * max_num_tokens * num_dp * max(K, N) + workspace2 = num_experts * max_num_tokens * num_dp * (N // 2) + return (workspace13, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + # Check constraints. + if self.use_int4_w4a16: + assert hidden_states.size(-1) // 2 == w1.size(2), ( + "Hidden size mismatch") + else: + assert hidden_states.size(-1) == w1.size(2), ( + f"Hidden size mismatch {hidden_states.size(-1)} " + f"!= {w1.size(2)}") + + assert hidden_states.is_contiguous( + ), "Hidden_states must be contiguous" + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn + ] + + # TODO: num_tokens -> max_num_tokens? 
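+        # hidden_states arrives here in batched (E, max_num_tokens, K) form,
+        # so the num_tokens unpacked below is the per-expert token capacity
+        # rather than the flat token count.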
+ E, num_tokens, N, K, top_k_num = mk._moe_problem_size( + hidden_states, w1, w2, topk_ids) + + assert w1.size(0) == E + assert w2.size(0) == E + + config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + dtype=hidden_states.dtype) + + config = try_get_optimal_moe_config( + w1.size(), + w2.size(), + top_k_num, + config_dtype, + num_tokens, + block_shape=self.block_shape, + ) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + elif hidden_states.dtype == torch.float8_e4m3fn: + compute_type = tl.bfloat16 + else: + raise ValueError( + f"Unsupported compute_type: {hidden_states.dtype}") + + #print(f"shape: E={E}, M={num_tokens}, N={N}, K={K}, top_k={top_k_num}") + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + intermediate_cache1 = _resize_cache(workspace13, (E, num_tokens, N)) + intermediate_cache2 = _resize_cache(workspace2, + (E, num_tokens, N // 2)) + intermediate_cache3 = _resize_cache(workspace13, (E, num_tokens, K)) + + # MM1 + invoke_moe_batched_triton_kernel(A=hidden_states, + B=w1, + C=intermediate_cache1, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a1q_scale, + B_scale=w1_scale, + B_zp=w1_zp, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + config=config, + block_shape=self.block_shape) + + # TODO: would be nice to use expert_num_tokens here to reduce + # garbage compute + self.activation(activation, intermediate_cache2.view(-1, N // 2), + intermediate_cache1.view(-1, N)) + + #qintermediate_cache2 = intermediate_cache2 + a2q_scale = a2_scale + # TODO (varun) : support w8a8 + assert not self.use_fp8_w8a8 + #if self.use_fp8_w8a8: + # qintermediate_cache2, a2q_scale = _fp8_quantize( + # intermediate_cache2, a2_scale, self.block_shape) + + invoke_moe_batched_triton_kernel(A=intermediate_cache2, + B=w2, + C=intermediate_cache3, + expert_num_tokens=expert_num_tokens, + compute_type=compute_type, + A_scale=a2q_scale, + B_scale=w2_scale, + B_zp=w2_zp, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + config=config, + block_shape=self.block_shape) + + return intermediate_cache3 diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 7bf4243305ac..78f8eb926dc8 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -8,16 +8,17 @@ import torch import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.logger import init_logger from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( _valid_deep_gemm, deep_gemm_moe_fp8) from vllm.model_executor.layers.fused_moe.moe_align_block_size import ( moe_align_block_size) -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - per_token_group_quant_fp8) -from vllm.model_executor.layers.quantization.utils.int8_utils import ( - per_token_group_quant_int8, per_token_quant_int8) +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP) +from vllm.model_executor.layers.fused_moe.utils import ( + _resize_cache, 
moe_kernel_quantize_input) from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils import direct_register_custom_op @@ -484,6 +485,20 @@ def invoke_fused_moe_kernel(A: torch.Tensor, assert topk_weights is None or topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 + if use_fp8_w8a8 or use_int8_w8a8: + assert B_scale is not None + assert (block_shape is None or triton.cdiv(B.shape[-2], block_shape[0]) + == B_scale.shape[-2]) + assert (block_shape is None or triton.cdiv(B.shape[-1], block_shape[1]) + == B_scale.shape[-1]) + + elif use_int8_w8a16 or use_int4_w4a16: + assert B_scale is not None + assert block_shape is None or block_shape[0] == 0 + else: + assert A_scale is None + assert B_scale is None + M = A.shape[0] num_tokens = M * top_k @@ -855,6 +870,7 @@ def fused_topk( gating_output: torch.Tensor, topk: int, renormalize: bool, + indices_type: Optional[torch.dtype] = None, ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: assert hidden_states.shape[0] == gating_output.shape[0], ( "Number of tokens mismatch") @@ -865,10 +881,11 @@ def fused_topk( topk, dtype=torch.float32, device=hidden_states.device) - topk_ids = torch.empty(M, - topk, - dtype=torch.int32, - device=hidden_states.device) + topk_ids = torch.empty( + M, + topk, + dtype=torch.int32 if indices_type is None else indices_type, + device=hidden_states.device) token_expert_indices = torch.empty(M, topk, dtype=torch.int32, @@ -962,6 +979,20 @@ def get_config_dtype_str( return None +# TODO (bnell): use scalar_type instead of bools? +def get_config_qtype( + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, +) -> Optional[torch.dtype]: + if use_fp8_w8a8: + return torch.float8_e4m3fn + elif use_int8_w8a8: + return torch.int8 + return None + + def inplace_fused_experts(hidden_states: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor, @@ -1128,7 +1159,10 @@ def fused_experts(hidden_states: torch.Tensor, a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, allow_deep_gemm: bool = False) -> torch.Tensor: - if (allow_deep_gemm and use_fp8_w8a8 + # For now, disable DeepGemm for small N (<= 512) until better + # permute/unpermute ops are available. 
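+    # w1 is laid out as (E, N, K), so N below is the output width of the
+    # first grouped GEMM (gate and up projections together).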
+ N = w1.shape[1] + if (allow_deep_gemm and use_fp8_w8a8 and N > 512 and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): assert apply_router_weight_on_input is False return deep_gemm_moe_fp8( @@ -1145,6 +1179,7 @@ def fused_experts(hidden_states: torch.Tensor, w2_scale=w2_scale, a1_scale=a1_scale, a2_scale=a2_scale, + apply_router_weight_on_input=apply_router_weight_on_input, ) else: return dispatch_fused_experts_func(inplace)( @@ -1171,87 +1206,37 @@ def fused_experts(hidden_states: torch.Tensor, block_shape=block_shape) -def moe_kernel_prepare_input( - A: torch.Tensor, - B: torch.Tensor, - A_scale: Optional[torch.Tensor], - B_scale: Optional[torch.Tensor], - use_fp8_w8a8: bool, - use_int8_w8a8: bool, - use_int8_w8a16: bool, - use_int4_w4a16: bool, - per_channel_quant: bool, +def fused_experts_impl( + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + apply_router_weight_on_input: bool = False, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, block_shape: Optional[list[int]] = None, -) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - if use_fp8_w8a8: - assert B_scale is not None - if block_shape is None: - # If weights are per-channel (per_channel_quant=True), then - # activations apply per-token quantization. 
Otherwise, assume - # activation tensor-wise fp8 quantization, dynamic or static - A, A_scale = ops.scaled_fp8_quant( - A, A_scale, use_per_token_if_dynamic=per_channel_quant) - else: - # activation block-wise fp8 quantization - assert len(block_shape) == 2 - _, block_k = block_shape[0], block_shape[1] - A, A_scale = per_token_group_quant_fp8(A, block_k) - assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] - # assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2] - # assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1] - elif use_int8_w8a8: - assert B_scale is not None - if block_shape is None: - # activation channel-wise int8 quantization - assert (per_channel_quant - ), "int8 quantization only supports block or channel-wise" - A, A_scale = per_token_quant_int8(A) - else: - # activation block-wise int8 quantization - assert len(block_shape) == 2 - _, block_k = block_shape[0], block_shape[1] - A, A_scale = per_token_group_quant_int8(A, block_k) - assert triton.cdiv(A.shape[-1], block_k) == A_scale.shape[-1] - # assert triton.cdiv(B.shape[-2], block_n) == B_scale.shape[-2] - # assert triton.cdiv(B.shape[-1], block_k) == B_scale.shape[-1] - elif use_int8_w8a16 or use_int4_w4a16: - assert B_scale is not None - assert block_shape is None or block_shape[0] == 0 - else: - assert A_scale is None - assert B_scale is None - - return A, A_scale - - -def fused_experts_impl(hidden_states: torch.Tensor, - w1: torch.Tensor, - w2: torch.Tensor, - topk_weights: torch.Tensor, - topk_ids: torch.Tensor, - inplace: bool = False, - activation: str = "silu", - apply_router_weight_on_input: bool = False, - use_fp8_w8a8: bool = False, - use_int8_w8a8: bool = False, - use_int8_w8a16: bool = False, - use_int4_w4a16: bool = False, - per_channel_quant: bool = False, - global_num_experts: int = -1, - expert_map: Optional[torch.Tensor] = None, - w1_scale: Optional[torch.Tensor] = None, - w2_scale: Optional[torch.Tensor] = None, - w1_zp: Optional[torch.Tensor] = None, - w2_zp: Optional[torch.Tensor] = None, - a1_scale: Optional[torch.Tensor] = None, - a2_scale: Optional[torch.Tensor] = None, - block_shape: Optional[list[int]] = None): +) -> torch.Tensor: # Check constraints. 
if use_int4_w4a16: assert hidden_states.shape[1] // 2 == w1.shape[ 2], "Hidden size mismatch" else: - assert hidden_states.shape[1] == w1.shape[2], "Hidden size mismatch" + assert hidden_states.shape[1] == w1.shape[2], ( + f"Hidden size mismatch {hidden_states.shape[1]} != {w1.shape[2]}") assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" @@ -1261,7 +1246,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, torch.float32, torch.float16, torch.bfloat16 ] - num_tokens, _ = hidden_states.shape + num_tokens = hidden_states.shape[0] E, N, _ = w1.shape K = w2.shape[1] if global_num_experts == -1: @@ -1276,6 +1261,11 @@ def fused_experts_impl(hidden_states: torch.Tensor, use_int4_w4a16=use_int4_w4a16, dtype=hidden_states.dtype) + qtype = get_config_qtype(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16) + get_config_func = functools.partial( try_get_optimal_moe_config, w1.shape, @@ -1338,15 +1328,10 @@ def fused_experts_impl(hidden_states: torch.Tensor, curr_topk_ids = topk_ids[begin_chunk_idx:end_chunk_idx] curr_topk_weights = topk_weights[begin_chunk_idx:end_chunk_idx] - qcurr_hidden_states, qa1_scale = moe_kernel_prepare_input( + qcurr_hidden_states, a1q_scale = moe_kernel_quantize_input( A=curr_hidden_states, - B=w1, A_scale=a1_scale, - B_scale=w1_scale, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, + qtype=qtype, per_channel_quant=per_channel_quant, block_shape=block_shape) @@ -1357,7 +1342,7 @@ def fused_experts_impl(hidden_states: torch.Tensor, invoke_fused_moe_kernel(qcurr_hidden_states, w1, intermediate_cache1, - qa1_scale, + a1q_scale, w1_scale, w1_zp, curr_topk_weights, @@ -1384,22 +1369,17 @@ def fused_experts_impl(hidden_states: torch.Tensor, else: raise ValueError(f"Unsupported FusedMoe activation: {activation}") - qintermediate_cache2, qa2_scale = moe_kernel_prepare_input( + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( A=intermediate_cache2, - B=w2, A_scale=a2_scale, - B_scale=w2_scale, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - use_int4_w4a16=use_int4_w4a16, + qtype=qtype, per_channel_quant=per_channel_quant, block_shape=block_shape) invoke_fused_moe_kernel(qintermediate_cache2, w2, intermediate_cache3, - qa2_scale, + a2q_scale, w2_scale, w2_zp, curr_topk_weights, @@ -1534,3 +1514,209 @@ def fused_moe( a1_scale=a1_scale, a2_scale=a2_scale, block_shape=block_shape) + + +class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__( + self, + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: Optional[list[int]] = None, + block_m: Optional[int] = None, + ): + super().__init__() + self.use_fp8_w8a8 = use_fp8_w8a8 + self.use_int4_w4a16 = use_int4_w4a16 + self.use_int8_w8a8 = use_int8_w8a8 + self.use_int8_w8a16 = use_int8_w8a16 + self.block_shape = block_shape + self.block_m = block_m + self.qtype = get_config_qtype(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16) + self.per_channel_quant = per_channel_quant + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + factor = num_experts if a.dim() == 3 else 1 + 
workspace1 = M * topk * max(N * 2, K) * factor + workspace2 = M * topk * N * factor + return (workspace1, workspace2, a.dtype) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + # Check constraints. + if self.use_int4_w4a16: + assert hidden_states.size(-1) // 2 == w1.size(2), ( + "Hidden size mismatch") + else: + assert hidden_states.size(-1) == w1.size(2), \ + (f"Hidden size mismatch {hidden_states.size(-1)} " + f"!= {w1.size(2)}") + + assert hidden_states.is_contiguous( + ), "Hidden_states must be contiguous" + assert hidden_states.dim() == 2 + assert w1.stride(-1) == 1, "Stride of last dimension must be 1" + assert w2.stride(-1) == 1, "Stride of last dimension must be 1" + assert hidden_states.dtype in [ + torch.float32, torch.float16, torch.bfloat16, torch.float8_e4m3fn + ] + + E, num_tokens, N, K, top_k_num = mk._moe_problem_size( + hidden_states, w1, w2, topk_ids) + + if global_num_experts == -1: + global_num_experts = E + + config_dtype = get_config_dtype_str(use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + dtype=hidden_states.dtype) + + config = try_get_optimal_moe_config( + w1.shape, + w2.shape, + top_k_num, + config_dtype, + num_tokens, + block_shape=self.block_shape, + ) + + if hidden_states.dtype == torch.bfloat16: + compute_type = tl.bfloat16 + elif hidden_states.dtype == torch.float16: + compute_type = tl.float16 + elif hidden_states.dtype == torch.float32: + compute_type = tl.float32 + elif hidden_states.dtype == torch.float8_e4m3fn: + compute_type = tl.bfloat16 + else: + raise ValueError( + f"Unsupported compute_type: {hidden_states.dtype}") + + # We can reuse the memory between these because by the time we need + # cache3, we're done with cache1 + intermediate_cache1 = _resize_cache(workspace13, + (num_tokens, top_k_num, N)) + intermediate_cache2 = _resize_cache(workspace2, + (num_tokens * top_k_num, N // 2)) + intermediate_cache3 = _resize_cache(workspace13, + (num_tokens, top_k_num, K)) + + sorted_token_ids, expert_ids, num_tokens_post_padded = ( + moe_align_block_size(topk_ids, config['BLOCK_SIZE_M'], + global_num_experts, expert_map)) + + invoke_fused_moe_kernel(hidden_states, + w1, + intermediate_cache1, + a1q_scale, + w1_scale, + w1_zp, + None, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + False, + top_k_num, + config, + compute_type=compute_type, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a8=self.use_int8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + per_channel_quant=self.per_channel_quant, + block_shape=self.block_shape) + + self.activation(activation, intermediate_cache2, + intermediate_cache1.view(-1, N)) + + a2q_scale: Optional[torch.Tensor] = None + + qintermediate_cache2, a2q_scale = moe_kernel_quantize_input( + intermediate_cache2, a2_scale, self.qtype, self.per_channel_quant, + self.block_shape) + + invoke_fused_moe_kernel(qintermediate_cache2, + w2, + intermediate_cache3, + a2q_scale, + w2_scale, + w2_zp, + None, + sorted_token_ids, + expert_ids, + 
num_tokens_post_padded, + False, + 1, + config, + compute_type=compute_type, + use_fp8_w8a8=self.use_fp8_w8a8, + use_int8_w8a8=self.use_int8_w8a8, + use_int8_w8a16=self.use_int8_w8a16, + use_int4_w4a16=self.use_int4_w4a16, + per_channel_quant=self.per_channel_quant, + block_shape=self.block_shape) + + return intermediate_cache3 + + +def modular_triton_fused_moe( + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: Optional[list[int]] = None, +) -> mk.FusedMoEModularKernel: + qtype = get_config_qtype( + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + ) + return mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP( + quant_dtype=qtype, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + ), + TritonExperts( + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + use_int4_w4a16=use_int4_w4a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + ), + ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 14f360e3bbf3..d083e0040c0e 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1,15 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 +import importlib +import threading from abc import abstractmethod +from dataclasses import dataclass from enum import Enum from typing import Callable, Optional +from weakref import WeakValueDictionary import torch import torch.nn.functional as F from torch.nn.parameter import UninitializedParameter import vllm.envs as envs -from vllm.config import get_current_vllm_config +from vllm.config import ParallelConfig, get_current_vllm_config from vllm.distributed import (get_dp_group, get_ep_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, @@ -26,8 +30,17 @@ from vllm.platforms.interface import CpuArchEnum from vllm.utils import direct_register_custom_op +has_pplx = importlib.util.find_spec("pplx_kernels") is not None + if current_platform.is_cuda_alike(): - from .fused_moe import fused_experts + from .fused_batched_moe import (BatchedPrepareAndFinalize, + BatchedTritonExperts) + from .fused_moe import TritonExperts, fused_experts + from .modular_kernel import (FusedMoEModularKernel, + FusedMoEPermuteExpertsUnpermute, + FusedMoEPrepareAndFinalize) + if has_pplx: + from .pplx_prepare_finalize import PplxPrepareAndFinalize else: fused_experts = None # type: ignore if is_rocm_aiter_moe_enabled(): @@ -42,6 +55,179 @@ fused_moe_pallas = None # type: ignore logger = init_logger(__name__) +# Note: this limit is somewhat arbitrary and might be changed later. +# The size of the activations will be E x MOE_DP_CHUNK_SIZE x hidden_dim. +MOE_DP_CHUNK_SIZE = 256 + + +@dataclass +class FusedMoEParallelConfig: + tp_size: int + dp_size: int + ep_size: int + tp_rank: int + dp_rank: int + ep_rank: int + + use_ep: bool # whether to use EP or not + + @property + def use_pplx_kernels(self): + return self.dp_size > 1 and self.use_ep and has_pplx + + @staticmethod + def make(tp_size_: int, dp_size_: int, + vllm_parallel_config: ParallelConfig) -> "FusedMoEParallelConfig": + """ + Determine MoE parallel configuration. Based on the input tp_size_, + dp_size_, ep_size_ and vllm's parallel config, determine what + level's of parallelism to use in the fused moe layer. 
+ + Args: + tp_size_ (int): tp_size passed into the FusedMoE constructor. + dp_size_ (int): dp_size passed into the FusedMoE constructor. + ep_size_ (int): ep_size passed into the FusedMoE constructor. + vllm_parallel_config (ParallelConfig): vllm's parallel config + object. + + Examples: + When there is no parallelism requested, i.e. tp_size_ = dp_size_ = 1, + we simply return the sizes unaltered and the ranks set to 0. + + Expert Parallelism is considered only when either dp_size_ or tp_size_ + is non trivial. + + When TP = 2, DP = 1 and EP = False, the configuration on different + devices, + - device 0 : TP = {2, 0} DP = {1, 0} EP = {1, 0} // + legend : {size, rank} + - device 1 : TP = {2, 1} DP = {1, 0} EP = {1, 0} + - Comment : Tensors are sharded across 2 devices. + + When TP = 1, DP = 2 and EP = False, the configuration on different + devices, + - device 0 : TP = {2, 0} DP = {2, 0} EP = {1, 0} + - device 1 : TP = {2, 1} DP = {2, 1} EP = {1, 0} + - Comment: There are 2 engine instances and the tensors are sharded + across 2 decvices. + + When TP = 2, DP = 2 and EP = False, the configuration on different + devices, + - device 0: TP = {4, 0} DP = {2, 0} EP = {1, 0} + - device 1: TP = {4, 1} DP = {2, 0} EP = {1, 0} + - device 2: TP = {4, 2} DP = {2, 1} EP = {1, 0} + - device 3: TP = {4, 3} DP = {2, 1} EP = {1, 0} + - Comment: There are 2 engine instances and the tensors are sharded + across 4 devices. + + When, TP = 2, DP = 1 and EP = True, the configuration on different + devices, + - device 0: TP = {1, 0} DP = {1, 0} EP = {2, 0} + - device 1: TP = {1, 0} DP = {1, 0} EP = {2, 1} + - Comment: The experts are split between the 2 devices. + + When, TP = 1, DP = 2 and EP = True, the configuration on different + devices, + - device 0: TP = {1, 0} DP = {2, 0} EP = {2, 0} + - device 1: TP = {1, 0} DP = {2, 1} EP = {2, 1} + - Comment: There are 2 engine instances and the experts are split + between the 2 devices. + + When TP = 2, DP = 2 and EP = True, the configuration on different + devices, + - device 0: TP = {1, 0} DP = {2, 0} EP = {4, 0} + - device 1: TP = {1, 0} DP = {2, 0} EP = {4, 1} + - device 2: TP = {1, 0} DP = {2, 1} EP = {4, 2} + - device 3: TP = {1, 0} DP = {2, 1} EP = {4, 3} + - Comment: There are 2 engine instances and the experts are split + between the 4 devices. + """ + + def flatten_tp_across_dp(dp_rank: int): + tp_rank = 0 if tp_size_ == 1 else get_tensor_model_parallel_rank() + # There are actually dp_size_ * tp_size_ devices. Update tp_size + # and tp_rank so we shard across all devices. + tp_size = dp_size_ * tp_size_ + tp_rank = dp_rank * tp_size_ + tp_rank + return tp_size, tp_rank + + use_ep = (dp_size_ * tp_size_ > 1 + and vllm_parallel_config.enable_expert_parallel) + + dp_size = dp_size_ + dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0 + tp_size, tp_rank = flatten_tp_across_dp(dp_rank) + + if not use_ep: + return FusedMoEParallelConfig(tp_size=tp_size, + tp_rank=tp_rank, + dp_size=dp_size, + dp_rank=dp_rank, + ep_size=1, + ep_rank=0, + use_ep=False) + # DP + EP / TP + EP / DP + TP + EP + assert use_ep + # In EP, each device owns a set of experts fully. There is no tensor + # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that. 
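For illustration only (not diff content): a standalone sketch of the rank flattening that flatten_tp_across_dp performs above, with the get_tensor_model_parallel_rank() / get_dp_group() lookups replaced by explicit arguments so the arithmetic behind the device tables in the docstring can be checked in isolation.

def flatten_tp_across_dp(tp_size_: int, dp_size_: int,
                         tp_rank: int, dp_rank: int) -> tuple[int, int]:
    # All dp_size_ * tp_size_ devices are treated as one flat TP group.
    tp_size = dp_size_ * tp_size_
    return tp_size, dp_rank * tp_size_ + tp_rank

# TP = 2, DP = 2, EP disabled: the device with (tp_rank=1, dp_rank=1)
# becomes TP = {4, 3}, matching "device 3" in the table above.
assert flatten_tp_across_dp(2, 2, 1, 1) == (4, 3)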
+ ep_size = tp_size + ep_rank = tp_rank + return FusedMoEParallelConfig(tp_size=1, + tp_rank=0, + dp_size=dp_size, + dp_rank=dp_rank, + ep_size=ep_size, + ep_rank=ep_rank, + use_ep=True) + + +# Adapted from pplx-kernels tests/all_to_all_utils.py +@dataclass +class MoEConfig: + num_experts: int + experts_per_token: int + hidden_dim: int + + num_local_experts: int + moe_parallel_config: FusedMoEParallelConfig + + in_dtype: torch.dtype # The activation type. + + # TODO: add more quantization params, blocked, per-token, etc. + block_size: int = 128 + + @property + def tp_size(self): + return self.moe_parallel_config.tp_size + + @property + def dp_size(self): + return self.moe_parallel_config.dp_size + + @property + def ep_size(self): + return self.moe_parallel_config.ep_size + + @property + def tp_rank(self): + return self.moe_parallel_config.tp_rank + + @property + def dp_rank(self): + return self.moe_parallel_config.dp_rank + + @property + def ep_rank(self): + return self.moe_parallel_config.ep_rank + + @property + def use_ep(self): + return self.moe_parallel_config.use_ep + + @property + def use_pplx_kernels(self): + return self.moe_parallel_config.use_pplx_kernels + class FusedMoeWeightScaleSupported(Enum): TENSOR = "tensor" @@ -58,6 +244,14 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int, params_dtype: torch.dtype, **extra_weight_attrs): raise NotImplementedError + def set_prepare_finalize( + self, + dp_size: int, + world_size: int, + prepare_finalize: FusedMoEPrepareAndFinalize, + ) -> bool: + return False + @abstractmethod def apply( self, @@ -80,12 +274,54 @@ def apply( raise NotImplementedError +class AllToAllCache: + + def __init__(self): + self._cache: WeakValueDictionary = WeakValueDictionary() + self._lock = threading.RLock() # Reentrant lock for thread safety + + def destroy(self): + with self._lock: + # TODO: can we do del self._cache? + for _, a2a in self._cache.items(): + a2a.destroy() + + def get_or_create(self, **kwargs): + assert has_pplx + import pplx_kernels as pplx + + # Create a hashable key from the kwargs + key = tuple(sorted((k, v) for k, v in kwargs.items())) + + with self._lock: + instance = self._cache.get(key) + if instance is None: + # TODO (varun): Add support to switch to intranode + # when all communications are within the same + # node. 
+ logger.debug("Create AllToAll %s", kwargs) + instance = pplx.AllToAll.internode(**kwargs) + self._cache[key] = instance + return instance + + +# Global singleton +_all_to_all_cache = AllToAllCache() + + +# Factory function as a cleaner interface +def get_all_to_all(**kwargs): + return _all_to_all_cache.get_or_create(**kwargs) + + @CustomOp.register("unquantized_fused_moe") class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): """MoE method without quantization.""" - def __init__(self): + def __init__(self, moe: MoEConfig): super().__init__() + self.fused_experts = fused_experts + self.moe = moe self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled() if self.rocm_aiter_moe_enabled: @@ -193,6 +429,47 @@ def apply( activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) + def set_prepare_finalize( + self, + dp_size: int, + world_size: int, + prepare_finalize: FusedMoEPrepareAndFinalize, + ) -> bool: + assert self.fused_experts == fused_experts + + experts: Optional[FusedMoEPermuteExpertsUnpermute] = None + + if isinstance(prepare_finalize, + (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)): + logger.debug("BatchedTritonExperts %s", self.moe) + experts = BatchedTritonExperts( + max_num_tokens=MOE_DP_CHUNK_SIZE, + world_size=world_size, + dp_size=dp_size, + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + ) + else: + logger.debug("TritonExperts %s", self.moe) + experts = TritonExperts( + use_fp8_w8a8=False, + use_int8_w8a8=False, + use_int8_w8a16=False, + use_int4_w4a16=False, + block_shape=None, + per_channel_quant=False, + ) + + self.fused_experts = FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + return True + def forward_cuda( self, layer: torch.nn.Module, @@ -221,9 +498,12 @@ def forward_cuda( num_expert_group=num_expert_group, custom_routing_function=custom_routing_function, scoring_func=scoring_func, - e_score_correction_bias=e_score_correction_bias) + e_score_correction_bias=e_score_correction_bias, + indices_type=torch.uint32 if self.moe.use_pplx_kernels else None) if self.rocm_aiter_moe_enabled: + assert not apply_router_weight_on_input + assert expert_map is None return self.rocm_aiter_fused_experts( hidden_states=x, w1=layer.w13_weight, @@ -232,18 +512,19 @@ def forward_cuda( topk_ids=topk_ids, activation=activation, apply_router_weight_on_input=apply_router_weight_on_input) - - return fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map) + else: + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=global_num_experts, + expert_map=expert_map, + ) def forward_cpu( self, @@ -399,6 +680,45 @@ def determine_expert_map( return (local_num_experts, expert_map) +def _construct_prepare_finalize( + moe: MoEConfig, quant_config: Optional[QuantizationConfig] +) -> Optional[FusedMoEPrepareAndFinalize]: + max_num_tokens = MOE_DP_CHUNK_SIZE + world_size = moe.ep_size + dp_size = moe.ep_size // moe.dp_size # dp_size actually means TP. 
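For illustration (not diff content): worked numbers for the sizes computed above, assuming attention runs with TP=2 and DP=2, expert parallelism is enabled, and the model has 8 experts. Under those assumptions FusedMoEParallelConfig.make() yields moe.ep_size == 4 and moe.dp_size == 2, so:

world_size = 4                       # moe.ep_size: one pplx rank per device
dp_size = 4 // 2                     # moe.ep_size // moe.dp_size == 2, the old TP width
num_dp = world_size // dp_size       # 2 DP replicas feed each rank's expert buffers
num_local_experts = 8 // world_size  # each rank fully owns 2 of the 8 experts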
+ rank = moe.ep_rank + + if moe.use_pplx_kernels: + logger.debug("using PplxPrepareAndFinalize") + + all_to_all = get_all_to_all( + max_num_tokens=max_num_tokens, + num_experts=moe.num_experts, + experts_per_token=moe.experts_per_token, # topk + rank=rank, + world_size=world_size, + dp_size=dp_size, + hidden_dim=moe.hidden_dim, + hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize, + # For blocked per token: set to + # ceil_div(hidden_dim, block_size) * sizeof(float32) + # For per-token: set to sizeof(float32) + hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else + ((moe.hidden_dim + moe.block_size - 1) // + moe.block_size * torch.float32.itemsize))) + + return PplxPrepareAndFinalize( + all_to_all, + max_num_tokens=max_num_tokens, + world_size=world_size, + rank=rank, + dp_size=dp_size, + quant_dtype=moe.in_dtype, + ) + + return None + + class FusedMoE(torch.nn.Module): """FusedMoE layer for MoE models. @@ -449,21 +769,16 @@ def __init__( params_dtype = torch.get_default_dtype() self.params_dtype = params_dtype - # Note: here we guard against accessing the TP and DP groups when - # uninitialized (this happens when testing) - self.tp_size = (tp_size if tp_size is not None else - get_tensor_model_parallel_world_size()) - tp_rank = 0 if self.tp_size == 1 else get_tensor_model_parallel_rank() - self.dp_size = (dp_size - if dp_size is not None else get_dp_group().world_size) - self.dp_rank = (0 - if self.dp_size == 1 else get_dp_group().rank_in_group) - self.global_num_experts = num_experts - - # Use expert parallelism instead of tensor parallelism? vllm_config = get_current_vllm_config() - use_ep = (vllm_config.parallel_config.enable_expert_parallel - and self.tp_size * self.dp_size > 1) + self.moe_parallel_config: FusedMoEParallelConfig = ( + FusedMoEParallelConfig.make( + tp_size_=(tp_size if tp_size is not None else + get_tensor_model_parallel_world_size()), + dp_size_=(dp_size if dp_size is not None else + get_dp_group().world_size), + vllm_parallel_config=vllm_config.parallel_config)) + + self.global_num_experts = num_experts # For smuggling this layer into the fused moe custom op self.use_direct_call = self.dp_size == 1 @@ -474,28 +789,17 @@ def __init__( compilation_config.static_forward_context[prefix] = self self.layer_name = prefix - if use_ep: - # Set TP size to 1 to adjust for EP and adjust EP size and rank - # for DP attention. 
- self.ep_rank = tp_rank + self.tp_size * self.dp_rank - self.tp_rank = 0 - self.ep_size = self.tp_size * self.dp_size - self.tp_size = 1 - + # Determine expert maps + if self.use_ep: self.local_num_experts, self.expert_map = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, global_num_experts=self.global_num_experts) else: - # Adjust TP size for DP attention - self.tp_rank = tp_rank + self.tp_size * self.dp_rank - self.ep_rank = 0 - self.tp_size = self.tp_size * self.dp_size - self.ep_size = 1 - self.local_num_experts = self.global_num_experts - self.expert_map = None + self.local_num_experts, self.expert_map = (self.global_num_experts, + None) + self.top_k = top_k - self.global_num_experts = num_experts assert intermediate_size % self.tp_size == 0 self.hidden_size = hidden_size @@ -520,14 +824,40 @@ def __init__( from vllm_hpu_extension.ops import DynamicFusedMOE self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts) + moe = MoEConfig( + num_experts=self.global_num_experts, + experts_per_token=top_k, + hidden_dim=hidden_size, + num_local_experts=self.local_num_experts, + moe_parallel_config=self.moe_parallel_config, + # TODO (bnell): this needs to be fixed for quantized types. + in_dtype=params_dtype, + ) + # Note: get_quant_method will look at the layer's local_num_experts # for heuristic purposes, so it must be initialized first. + quant_method: Optional[QuantizeMethodBase] = None + if quant_config is None: - self.quant_method: Optional[QuantizeMethodBase] = ( - UnquantizedFusedMoEMethod()) + quant_method = UnquantizedFusedMoEMethod(moe) + prepare_finalize = _construct_prepare_finalize(moe, quant_config) else: - self.quant_method = quant_config.get_quant_method(self, prefix) - assert self.quant_method is not None + quant_method = quant_config.get_quant_method(self, prefix) + # No pplx for quantized types yet. 
+ prepare_finalize = None + + assert quant_method is not None + assert isinstance(quant_method, FusedMoEMethodBase) + self.quant_method = quant_method + + if prepare_finalize is not None: + world_size = moe.ep_size + dp_size = int(moe.ep_size // moe.dp_size) + success = self.quant_method.set_prepare_finalize( + dp_size, world_size, prepare_finalize) + if not success: + logger.warning("DP+EP not supported for %s.", + type(self.quant_method)) moe_quant_params = { "num_experts": self.local_num_experts, @@ -546,6 +876,38 @@ def __init__( self.quant_method.create_weights(layer=self, **moe_quant_params) + @property + def tp_size(self): + return self.moe_parallel_config.tp_size + + @property + def dp_size(self): + return self.moe_parallel_config.dp_size + + @property + def ep_size(self): + return self.moe_parallel_config.ep_size + + @property + def tp_rank(self): + return self.moe_parallel_config.tp_rank + + @property + def dp_rank(self): + return self.moe_parallel_config.dp_rank + + @property + def ep_rank(self): + return self.moe_parallel_config.ep_rank + + @property + def use_ep(self): + return self.moe_parallel_config.use_ep + + @property + def use_pplx_kernels(self): + return self.moe_parallel_config.use_pplx_kernels + def _load_per_tensor_weight_scale(self, shard_id: str, param: torch.nn.Parameter, loaded_weight: torch.Tensor, @@ -830,7 +1192,8 @@ def select_experts(hidden_states: torch.Tensor, num_expert_group: Optional[int] = None, custom_routing_function: Optional[Callable] = None, scoring_func: str = "softmax", - e_score_correction_bias: Optional[torch.Tensor] = None): + e_score_correction_bias: Optional[torch.Tensor] = None, + indices_type: Optional[torch.dtype] = None): from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk # DeekSeekv2 uses grouped_top_k @@ -846,21 +1209,52 @@ def select_experts(hidden_states: torch.Tensor, topk_group=topk_group, scoring_func=scoring_func, e_score_correction_bias=e_score_correction_bias) + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) elif custom_routing_function is None: topk_weights, topk_ids, token_expert_indices = fused_topk( hidden_states=hidden_states, gating_output=router_logits, topk=top_k, - renormalize=renormalize) + renormalize=renormalize, + indices_type=indices_type, + ) else: topk_weights, topk_ids = custom_routing_function( hidden_states=hidden_states, gating_output=router_logits, topk=top_k, renormalize=renormalize) + if indices_type is not None: + topk_ids = topk_ids.to(dtype=indices_type) return topk_weights, topk_ids + def must_reduce_shared_expert_outputs(self) -> bool: + """ + The shared_experts are typically computed using the RowParallelLinear + layer. The result of this function is typically used as + the reduce_results argument to the module. + When just tensor-parallel is used, it is not required to reduce + the shared_experts results immediately. Instead we reduce at the + once at the end of the MoE op. (Refer to DeepSeekV2MoE module) + With EP and the pplx kernels - this is no longer viable as all + GPU ranks in DP, produce the complete set of hidden_states. + Therefore it is required that we reduce the shared_experts output + early. + """ + return self.use_pplx_kernels + + def maybe_all_reduce_tensor_model_parallel( + self, final_hidden_states: torch.Tensor): + """ + The pplx combine kernel reduces across GPU ranks by default. 
+ """ + if self.use_pplx_kernels: + return final_hidden_states + else: + return tensor_model_parallel_all_reduce(final_hidden_states) + def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): if self.use_direct_call: @@ -869,9 +1263,62 @@ def forward(self, hidden_states: torch.Tensor, return torch.ops.vllm.moe_forward(hidden_states, router_logits, self.layer_name) + def forward_impl_chunked(self, full_hidden_states: torch.Tensor, + full_router_logits: torch.Tensor): + + full_final_hidden_states = torch.empty_like(full_hidden_states) + + def process_chunk(chunk_start, chunk_end, skip_result_store=False): + hidden_states = full_hidden_states[chunk_start:chunk_end, :] + router_logits = full_router_logits[chunk_start:chunk_end, :] + + # Matrix multiply. + final_hidden_states = self.quant_method.apply( + layer=self, + x=hidden_states, + router_logits=router_logits, + top_k=self.top_k, + renormalize=self.renormalize, + use_grouped_topk=self.use_grouped_topk, + global_num_experts=self.global_num_experts, + expert_map=self.expert_map, + topk_group=self.topk_group, + num_expert_group=self.num_expert_group, + custom_routing_function=self.custom_routing_function, + scoring_func=self.scoring_func, + e_score_correction_bias=self.e_score_correction_bias, + activation=self.activation, + ) + + if not skip_result_store: + full_final_hidden_states[chunk_start:chunk_end, :].copy_( + final_hidden_states) + + ctx = get_forward_context() + max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu + moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE + + num_tokens = full_hidden_states.size(0) + for chunk_start_ in range(0, max_tokens_across_dp, + moe_dp_chunk_size_per_rank): + chunk_start = chunk_start_ + chunk_end = min(chunk_start + moe_dp_chunk_size_per_rank, + max_tokens_across_dp) + # clamp start and end + chunk_start = min(chunk_start, num_tokens - 1) + chunk_end = min(chunk_end, num_tokens) + + process_chunk(chunk_start, + chunk_end, + skip_result_store=chunk_start_ >= num_tokens) + + return full_final_hidden_states + def forward_impl(self, hidden_states: torch.Tensor, router_logits: torch.Tensor): assert self.quant_method is not None + if self.moe_parallel_config.use_pplx_kernels: + return self.forward_impl_chunked(hidden_states, router_logits) if self.dp_size > 1: hidden_states, router_logits = get_ep_group().dispatch( diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py new file mode 100644 index 000000000000..7d3ddf8f14c4 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -0,0 +1,364 @@ +# SPDX-License-Identifier: Apache-2.0 +from abc import ABC, abstractmethod +from typing import Optional + +import torch + +# +# This file defines a set of base classes used to make MoE kernels more modular. +# The goal is to be able to utilize different communication mechanisms with +# any fused MoE kernel without needing to have combinatoric implementations. +# +# The fused moe kernels are broken down into the following components: +# +# [Router] → [Quantize-Dispatch] → [Permute-Experts-Unpermute] → [Combine] +# +# Each component will be independent of the others except for +# [Quantize-Dispatch] and `[Combine] (see below). The components can then be +# mixed and matched with so that DP+EP can be supported easily for multiple +# MoE kernel implementations. +# +# The following main classes are defined: +# * FusedMoEPrepareAndFinalize - an abstract base class for preparation of MoE +# inputs (e.g. 
quantization, distribution) and finalization of Moe outputs. +# The prepare method must take care of any needed quantization and the +# finalize method must apply weights and do the final reduction of the output. +# * FusedMoEPermuteExpertsUnpermute - an abstract base class for the main fused +# MoE operation. One important feature to note is that this class does not +# apply topk weights or reduce the final output. +# * FusedMoEModularKernel - an interface class that combines a +# FusedMoEPrepareAndFinalize and a FusedMoEPermuteExpertsUnpermute to +# provide the standard fused MoE kernel interface. +# +# [Quantize-Prepare] and [Finalize] functionality are bundled into a single +# class `FusedMoEPrepareAndFinalize` since they could use collective +# communication mechanisms that need to be consistent. +# + + +def _moe_problem_size( + a1: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, +) -> tuple[int, int, int, int, int]: + """ + Extract the MoE problem size from the given tensor arguments: + - a: The hidden states, input to the MoE layer. + - w1: The first set of expert weights. + - w2: The second set of expert weights. + - topk_ids: The topk ids. + + Note: extracting the problem shape from the weight and activation tensors is + not obvious. It needs to be done this way specifically due to subtle issues + with particular kernels, e.g. the int4 kernels divide the trailing dimension + by two, so it's not "correct" to extract N or K from the trailing dimension + of w1 or w2. Similarly, some kernels transpose the weights, so this needs + to be kept in mind. + """ + assert w1.dim() == 3 and w2.dim() == 3 + E, N, _ = w1.size() + K = w2.size(1) + + if a1.dim() == 2: + # Make sure we are using the correct a1 (pre-permute). + assert topk_ids.size(0) == a1.size(0), \ + f"{topk_ids.size(0)} != {a1.size(0)}" + M = a1.size(0) + else: + assert a1.dim() == 3 + assert a1.size(0) == E, f"{a1.size(0)} == {E}" + M = a1.size(1) # This is max_num_tokens + + assert topk_ids.dim() == 2 + topk = topk_ids.size(1) + + return E, M, N, K, topk + + +class FusedMoEPrepareAndFinalize(ABC): + """ + An abstract base class for the [Quantize-Prepare] and [Finalize] steps + described above. + """ + + @abstractmethod + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform any quantization (and/or) dispatching needed + for this kernel. + - a1: The (unquantized) input to the MoE layer. + - a1_scale: Optional scales for a1 + - a2_scale: Optional scales for the second MoE gemm. Required to make + sure the quantization is consistent for both gemms. + - topk_ids: The topk ids. + - topk_weights: The topk weights. + - num_experts: The total number of experts in the global expert space. + - expert_map: A tensor mapping expert indices from the global expert + space to the local expert space of the expert parallel shard. + - apply_router_weight_on_input: When True, apply the weights to the + activations, before quantization + dispatching. + + Returns a tuple of: + - quantized + dispatched a. + - quantized + dispatched a1_scales. 
+ """ + raise NotImplementedError + + @abstractmethod + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + """ + Perform any combine plus apply weights and perform a reduction on the + fused experts output. + - output: The output tensor, written in place. Must be (M, K) shape. + - fused_expert_output: The unweighted, unreduced output of the fused + experts, it will have (M, topk, K) shape. + - topk_weights: The weights to be applied to the fused_experts_output. + - topk_ids: The topk_ids. + - apply_router_weight_on_input: When False, apply the weights to + fused_expert_output. + """ + raise NotImplementedError + + +class FusedMoEPermuteExpertsUnpermute(ABC): + """ + An abstract base class for the [Permute-Experts-Unpermute] step described + above. + """ + + @abstractmethod + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + """ + Compute the number of elements for the temporary outputs of the two + gemms and activation in the fused expert function. Since the + gemms are independent, the workspace for the first gemm can be shared + with the workspace for the last gemm. + + Returns a tuple of: + - Number of workspace13 elements: must be large enough to hold the + result of either expert gemm. + - Number of workspace2 elements: must be large enough to hold the + result of the activation function. + - Workspace type: The dtype to use for the workspace tensors. + """ + raise NotImplementedError + + def activation(self, activation: str, output: torch.Tensor, + input: torch.Tensor) -> None: + assert output.size(-1) * 2 == input.size(-1) + if activation == "silu": + torch.ops._C.silu_and_mul(output, input) + elif activation == "gelu": + torch.ops._C.gelu_and_mul(output, input) + else: + raise ValueError(f"Unsupported FusedMoe activation: {activation}") + + @abstractmethod + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + """ + This function computes the intermediate result of a Mixture of Experts + (MoE) layer using two sets of weights, w1 and w2. + + Parameters: + - hidden_states: (torch.Tensor): The (quantized) input tensor to the MoE + layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - topk_ids (torch.Tensor): A map of row to expert id. + - activation (str): The activation function to apply after the first + MoE layer. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2. + - w1_zp (Optional[torch.Tensor]): Optional zero points to be used for + w1. 
+ - w2_zp (Optional[torch.Tensor]): Optional zero points to be used for + w2. + - a1q_scale (Optional[torch.Tensor]): Optional quantized scale to be + used for a1. + - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. + - workspace13 (torch.Tensor): A scratch tensor used for gemm outputs + must be large enough to hold output of either MoE gemm. + - workspace2 (torch.Tensor): A scratch tensor used for the activation + function. + - expert_num_tokens: An optional tensor containing the number of tokens + assigned to each expert when using batched experts format input. + + Returns: + - torch.Tensor: The unweighted, unreduced output tensor + """ + raise NotImplementedError + + +class FusedMoEModularKernel(torch.nn.Module): + """ + This class combines a FusedMoEPrepareAndFinalize instance and + a FusedMoEPermuteExpertsUnpermute to provide an interface that + is compatible with the `fused_experts` function in fused_moe.py. + + It takes care of managing any required scratch space. + + Note: Instances of this class should only be used for a single model + layer due to any layer specific state that may be used by the component + objects. + """ + + def __init__( + self, + prepare_finalize: FusedMoEPrepareAndFinalize, + fused_experts: FusedMoEPermuteExpertsUnpermute, + ): + super().__init__() + self.prepare_finalize = prepare_finalize + self.fused_experts = fused_experts + + def forward( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + inplace: bool = False, + activation: str = "silu", + global_num_experts: int = -1, + expert_map: Optional[torch.Tensor] = None, + w1_scale: Optional[torch.Tensor] = None, + w2_scale: Optional[torch.Tensor] = None, + w1_zp: Optional[torch.Tensor] = None, + w2_zp: Optional[torch.Tensor] = None, + a1_scale: Optional[torch.Tensor] = None, + a2_scale: Optional[torch.Tensor] = None, + apply_router_weight_on_input: bool = False, + ) -> torch.Tensor: + """ + This function computes a Mixture of Experts (MoE) layer using two sets + of weights, w1 and w2, and top-k gating mechanism. + + Parameters: + - hidden_states: (torch.Tensor): The input tensor to the MoE layer. + - w1 (torch.Tensor): The first set of expert weights. + - w2 (torch.Tensor): The second set of expert weights. + - topk_weights (torch.Tensor): The topk weights applied at the end of + the layer. + - topk_ids (torch.Tensor): A map of row to expert id. + - inplace (bool): If True, perform the operation in-place. + Defaults to False. + - activation (str): The activation function to apply after the first + MoE layer. + - global_num_experts (int): The total number of experts in the global + expert space. + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert + parallel shard. + - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1. + - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2. + - w1_zp (Optional[torch.Tensor]): Optional zero points to be used for + w1. + - w2_zp (Optional[torch.Tensor]): Optional zero points to be used for + w2. + - a1_scale (Optional[torch.Tensor]): Optional scale to be used for a1. + - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2. + - apply_router_weight_on_input (bool): When true, the topk weights are + applied directly on the inputs. This is only applicable when topk is + 1. 
+
+        Returns:
+        - torch.Tensor: The output tensor after applying the MoE layer.
+        """
+        a1 = hidden_states
+        E, M, N, K, top_k = _moe_problem_size(a1, w1, w2, topk_ids)
+
+        if global_num_experts == -1:
+            global_num_experts = E
+
+        output = a1 if inplace else torch.zeros_like(a1)
+
+        workspace13_shape, workspace2_shape, workspace_dtype = (
+            self.fused_experts.workspace_shapes(a1, M, N, K, top_k,
+                                                global_num_experts))
+
+        # We can reuse the memory between cache1 and cache3 because by the time
+        # we need cache3, we're done with cache1
+        workspace13 = torch.zeros(workspace13_shape,
+                                  device=a1.device,
+                                  dtype=workspace_dtype)
+        workspace2 = torch.zeros(workspace2_shape,
+                                 device=a1.device,
+                                 dtype=workspace_dtype)
+
+        a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare(
+            a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts,
+            expert_map, apply_router_weight_on_input)
+
+        fused_out = self.fused_experts.apply(
+            a1q,
+            w1,
+            w2,
+            topk_ids,
+            activation=activation,
+            global_num_experts=global_num_experts,
+            expert_map=expert_map,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            w1_zp=w1_zp,
+            w2_zp=w2_zp,
+            a1q_scale=a1q_scale,
+            a2_scale=a2_scale,
+            workspace13=workspace13,
+            workspace2=workspace2,
+            expert_num_tokens=expert_num_tokens,
+        )
+
+        self.prepare_finalize.finalize(output, fused_out, topk_weights,
+                                       topk_ids, apply_router_weight_on_input)
+
+        return output
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 90cb04084809..270e7cf1298a 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -3,6 +3,74 @@
 import torch
 
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
+    moe_align_block_size)
+from vllm.model_executor.layers.fused_moe.utils import _fp8_perm
+
+
+def _moe_permute(
+    curr_hidden_states: torch.Tensor,
+    a1q_scale: Optional[torch.Tensor],
+    curr_topk_ids: torch.Tensor,
+    global_num_experts: int,
+    expert_map: Optional[torch.Tensor],
+    block_m: int,
+) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor, torch.Tensor,
+           Optional[torch.Tensor]]:
+    """
+    Determine the sorted_token_ids, expert_ids for the given problem size.
+    Permute the hidden states and scales according to `sorted_token_ids`.
+    """
+    top_k_num = curr_topk_ids.size(1)
+
+    tokens_in_chunk = curr_hidden_states.size(0)
+
+    sorted_token_ids, expert_ids, num_tokens_post_padded = (
+        moe_align_block_size(curr_topk_ids,
+                             block_m,
+                             global_num_experts,
+                             expert_map,
+                             pad_sorted_ids=True))
+
+    inv_perm: Optional[torch.Tensor] = None
+
+    num_tokens = top_k_num * tokens_in_chunk
+    sorted_token_ids = sorted_token_ids.clamp(max=num_tokens - 1)
+    expert_ids = torch.repeat_interleave(expert_ids, block_m, dim=0)
+    inv_perm = torch.argsort(sorted_token_ids)[:num_tokens]
+
+    # Permute according to sorted token ids.
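For illustration (not diff content): moe_align_block_size returns slot ids into the flattened (token, k) assignment space, so dividing by top_k_num, as the permutation below does, recovers the source token row for each slot. A tiny sketch of that index math with made-up slot ids:

import torch

top_k = 2
sorted_token_ids = torch.tensor([0, 2, 1, 3, 5, 4])  # hypothetical slot ids
source_rows = sorted_token_ids // top_k              # tensor([0, 1, 0, 1, 2, 2])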
+ curr_hidden_states = _fp8_perm(curr_hidden_states, + sorted_token_ids // top_k_num) + + if a1q_scale is not None: + a1q_scale = a1q_scale[sorted_token_ids // top_k_num] + + return (curr_hidden_states, a1q_scale, sorted_token_ids, expert_ids, + inv_perm) + + +def _moe_unpermute_and_reduce( + out: torch.Tensor, + curr_hidden: torch.Tensor, + inv_perm: Optional[torch.Tensor], + topk_weight: torch.Tensor, + apply_router_weight_on_input: bool, +) -> None: + """ + Unpermute the final result and apply topk_weights, then perform the final + reduction on the hidden states. + """ + M, topk = topk_weight.size() + K = curr_hidden.size(-1) + if inv_perm is not None: + curr_hidden = curr_hidden[inv_perm, ...] + curr_hidden = curr_hidden.view(-1, topk, K) + if not apply_router_weight_on_input: + curr_hidden.mul_(topk_weight.view(M, -1, 1)) + ops.moe_sum(curr_hidden, out) + def moe_permute( hidden_states: torch.Tensor, @@ -17,21 +85,21 @@ def moe_permute( fill_invalid_expert: int = -1 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: """ - This function expands and permutes activation to gather uncontinuous tokens + This function expands and permutes activation to gather uncontinuous tokens for each expert. Parameters: - - hidden_states (torch.Tensor): The input tensor to the MoE layer. + - hidden_states (torch.Tensor): The input tensor to the MoE layer. - topk_weights (torch.Tensor): topk expert route weight for each token. - topk_ids (torch.Tensor): topk expert route id for each token. - token_expert_indices (torch.Tensor): indice for expanded hidden. - topk (int): The number of top-k experts to select. - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. - - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices - from the global expert space to the local expert space of the expert + - expert_map (Optional[torch.Tensor]): A tensor mapping expert indices + from the global expert space to the local expert space of the expert parallel shard. - align_block_size (Optional[int]): align group gemm block size for deepgemm - - fill_invalid_expert(int): fill expert id in m_indices for invalid expert + - fill_invalid_expert(int): fill expert id in m_indices for invalid expert to workaround DeepGemm unsupported -1 in m_indices Returns: - permuted_hidden_states (torch.Tensor): permuted activation. @@ -39,10 +107,10 @@ def moe_permute( of each expert for standard grouped gemm. if enable 'align_block_size' expert_first_token_offset will align up to 'align_block_size'. - src_row_id2dst_row_id_map (torch.Tensor): idx map for moe_unpermute. - - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records + - m_indices: m_indices for grouped gemm in deepgemm,`m_indices[i]` records the group which the j-th row of the LHS belong to.` """ - n_token, n_hidden = hidden_states.shape + n_token, n_hidden = hidden_states.size() assert (n_hidden * hidden_states.element_size() ) % 16 == 0, "permue kernel need hidden dim align to 16B" permuted_row_size = n_token * topk @@ -87,7 +155,7 @@ def moe_unpermute( n_local_expert: int, ) -> torch.Tensor: """ - This function expands and permutes activation to gathering uncontinuous + This function expands and permutes activation to gathering uncontinuous tokens for each expert. Parameters: - permuted_hidden_states (torch.Tensor): permuted activation. @@ -99,10 +167,10 @@ def moe_unpermute( - n_expert (int): The number of expert. - n_local_expert (int): The number of expert in current EP rank. 
Returns: - - hidden_states (torch.Tensor): The reduced and unpermuted activation - tensor. + - hidden_states (torch.Tensor): The reduced and unpermuted activation + tensor. """ - n_token, n_hidden = topk_weights.shape[0], permuted_hidden_states.shape[-1] + n_token, n_hidden = topk_weights.size(0), permuted_hidden_states.size(-1) assert (n_hidden * permuted_hidden_states.element_size() ) % 16 == 0, "unpermue kernel need hidden dim align to 16B" hidden_states = torch.empty((n_token, n_hidden), diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py new file mode 100644 index 000000000000..b1126b94e45a --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import pplx_kernels as pplx +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + + +# Note use: layer.get_all_to_all() to get an AllToAll instance +# The max_num_tokens, world_size and dp_size must be the same +# as the ones used to create the AllToAll. +class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): + + def __init__(self, + a2a: pplx.AllToAll, + max_num_tokens: int, + world_size: int, + rank: int, + dp_size: int, + quant_dtype: Optional[torch.dtype] = None, + block_shape: Optional[list[int]] = None): + super().__init__() + assert max_num_tokens > 0 + self.a2a = a2a + self.block_shape = block_shape + self.max_num_tokens = max_num_tokens + self.world_size = world_size + self.rank = rank + self.dp_size = dp_size + self.quant_dtype = quant_dtype + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + rank_topk_weights: torch.Tensor, + rank_topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + num_tokens = a1.size(0) # M + hidden_dim = a1.size(-1) # K + + assert rank_topk_ids.size(0) == num_tokens + # assert expert_map is None, "NYI" + + # Is this always going to be a1.device? + device = a1.device + + if apply_router_weight_on_input: + topk = rank_topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, ( + "apply_router_weight_on_input is only implemented for topk=1") + a1 = a1 * rank_topk_weights.to(a1.dtype) + + per_act_token = a1_scale.numel() != 1 if a1_scale is not None else ( + a2_scale.numel() != 1 if a2_scale is not None else False) + + a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale, + self.quant_dtype, + per_act_token, + self.block_shape) + + # rem_experts need to be 0 for pplx to work properly. 
+ rem_experts = num_experts % self.world_size + assert rem_experts == 0 + num_local_experts = ((num_experts // self.world_size) + + (1 if self.rank < rem_experts else 0)) + + expert_num_tokens = torch.empty( + num_local_experts, + dtype=torch.int32, + device=device, + ) + + num_dp = self.world_size // self.dp_size + expert_x = torch.empty( + (num_local_experts, self.max_num_tokens * num_dp, hidden_dim), + dtype=a1q.dtype, + device=device, + ) + + expert_x_scale: Optional[torch.Tensor] = None + if a1q.dtype.itemsize == 1: + float32_size = torch.float32.itemsize + block_size = (self.block_shape[0] if self.block_shape is not None + else 1) * float32_size + expert_x_scale = torch.empty( + ( + num_experts, + expert_x.size(1), + (expert_x.size(2) + block_size - 1) // block_size, + ), + dtype=torch.float32, + device=device, + ) + + # This argument is optional, defaults to indices.size(0) + # There's not much point setting this unless it is != indices.size(0) + bound_m: Optional[torch.Tensor] = None + + self.a2a.dispatch( + out_expert_num_tokens=expert_num_tokens, + out_expert_x=expert_x, + out_expert_x_scale=expert_x_scale, + dp_x=a1q, + dp_x_scale=a1q_scale, + indices=rank_topk_ids, + bound_m=bound_m, + ) + + return expert_x, expert_x_scale, expert_num_tokens + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + num_tokens = output.size(0) # M + # This argument is optional + # There's not much point setting this unless it is != topk_ids.size(0) + bound_m: Optional[torch.Tensor] = None + + assert topk_ids.size(0) == num_tokens, ( + f"{topk_ids.size(0)} == {num_tokens}") + assert output.size(0) <= self.max_num_tokens, ( + f"{output.size(0)} <= {self.max_num_tokens}") + assert output.size(1) == fused_expert_output.size(-1) + + # Set weights to 1 if we did them in dispatch. This is hacky. 
+ if apply_router_weight_on_input: + topk_weights = torch.ones_like(topk_weights) + + self.a2a.combine(out_tokens=output, + indices=topk_ids, + weights=topk_weights, + expert_y=fused_expert_output, + bound_m=bound_m) diff --git a/vllm/model_executor/layers/fused_moe/prepare_finalize.py b/vllm/model_executor/layers/fused_moe/prepare_finalize.py new file mode 100644 index 000000000000..98f98b3bd20b --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/prepare_finalize.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import ( + _moe_unpermute_and_reduce) +from vllm.model_executor.layers.fused_moe.utils import ( + moe_kernel_quantize_input) + + +class MoEPrepareAndFinalizeNoEP(mk.FusedMoEPrepareAndFinalize): + + def __init__( + self, + quant_dtype: Optional[torch.dtype] = None, + per_channel_quant: bool = False, + block_shape: Optional[list[int]] = None, + ): + super().__init__() + self.per_channel_quant = per_channel_quant + self.block_shape = block_shape + self.quant_dtype = quant_dtype + + def prepare( + self, + a1: torch.Tensor, + a1_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + num_experts: int, + expert_map: Optional[torch.Tensor], + apply_router_weight_on_input: bool = False, + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]: + if apply_router_weight_on_input: + topk = topk_ids.size(1) + # TODO: this only works for topK=1, will need to update for topK>1 + assert topk == 1, \ + "apply_router_weight_on_input is only implemented for topk=1" + a1.mul_(topk_weights.to(a1.dtype)) + + a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale, + self.quant_dtype, + self.per_channel_quant, + self.block_shape) + + return a1q, a1q_scale, None + + def finalize( + self, + output: torch.Tensor, + fused_expert_output: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + apply_router_weight_on_input: bool, + ) -> None: + _moe_unpermute_and_reduce(output, fused_expert_output, None, + topk_weights, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py new file mode 100644 index 000000000000..2cfe373140bb --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py @@ -0,0 +1,112 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch + +import vllm.model_executor.layers.fused_moe.modular_kernel as mk +from vllm.model_executor.layers.fused_moe.deep_gemm_moe import ( + DeepGemmExperts, _valid_deep_gemm, _valid_deep_gemm_shape) +from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts + + +class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute): + + def __init__(self, + use_fp8_w8a8: bool = False, + use_int8_w8a8: bool = False, + use_int8_w8a16: bool = False, + use_int4_w4a16: bool = False, + per_channel_quant: bool = False, + block_shape: Optional[list[int]] = None, + block_m: Optional[int] = None, + allow_deep_gemm: bool = False): + super().__init__() + self.triton_expert = TritonExperts(use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + block_shape=block_shape, + block_m=block_m) + 
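For illustration (not diff content): the building blocks added in this patch compose exactly as modular_triton_fused_moe() does earlier in the fused_moe.py hunk. A minimal sketch for an unquantized model (all quantization flags off is an assumption of this example):

import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts
from vllm.model_executor.layers.fused_moe.prepare_finalize import (
    MoEPrepareAndFinalizeNoEP)

fused = mk.FusedMoEModularKernel(
    MoEPrepareAndFinalizeNoEP(),   # no quantization, no dispatch/combine
    TritonExperts(use_fp8_w8a8=False,
                  use_int8_w8a8=False,
                  use_int8_w8a16=False,
                  use_int4_w4a16=False,
                  per_channel_quant=False),
)
# Usable like fused_experts():
# out = fused(hidden_states, w1, w2, topk_weights, topk_ids, inplace=True)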
self.deep_gemm_expert = DeepGemmExperts() + self.allow_deep_gemm = allow_deep_gemm + self.use_fp8_w8a8 = use_fp8_w8a8 + + def workspace_shapes( + self, + a: torch.Tensor, + M: int, + N: int, + K: int, + topk: int, + num_experts: int, + ) -> tuple[int, int, torch.dtype]: + # Note: the deep gemm workspaces are strictly larger than the triton + # workspaces so we can be pessimistic here and allocate for DeepGemm + # even if we fall back to triton later, e.g. if expert maps are set. + if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K): + return self.deep_gemm_expert.workspace_shapes( + a, M, N, K, topk, num_experts) + else: + return self.triton_expert.workspace_shapes(a, M, N, K, topk, + num_experts) + + def apply( + self, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: Optional[torch.Tensor], + w1_scale: Optional[torch.Tensor], + w2_scale: Optional[torch.Tensor], + w1_zp: Optional[torch.Tensor], + w2_zp: Optional[torch.Tensor], + a1q_scale: Optional[torch.Tensor], + a2_scale: Optional[torch.Tensor], + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: Optional[torch.Tensor], + ) -> torch.Tensor: + N = w1.size(1) + if (self.allow_deep_gemm and self.use_fp8_w8a8 and N > 512 + and _valid_deep_gemm(hidden_states, w1, w2, expert_map)): + return self.deep_gemm_expert.apply( + hidden_states, + w1, + w2, + topk_ids, + activation, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1q_scale, + a2_scale, + workspace13, + workspace2, + expert_num_tokens, + ) + else: + return self.triton_expert.apply( + hidden_states, + w1, + w2, + topk_ids, + activation, + global_num_experts, + expert_map, + w1_scale, + w2_scale, + w1_zp, + w2_zp, + a1q_scale, + a2_scale, + workspace13, + workspace2, + expert_num_tokens, + ) diff --git a/vllm/model_executor/layers/fused_moe/utils.py b/vllm/model_executor/layers/fused_moe/utils.py index 1acbba2056b0..d9d2520e18b3 100644 --- a/vllm/model_executor/layers/fused_moe/utils.py +++ b/vllm/model_executor/layers/fused_moe/utils.py @@ -7,6 +7,8 @@ from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8) +from vllm.model_executor.layers.quantization.utils.int8_utils import ( + per_token_group_quant_int8, per_token_quant_int8) from vllm.utils import cdiv @@ -15,34 +17,81 @@ def _resize_cache(x: torch.Tensor, v: tuple[int, ...]) -> torch.Tensor: Shrink the given tensor and apply the given view to it. This is used to resize the intermediate fused_moe caches. """ - assert prod(v) <= x.numel() + assert prod( + v) <= x.numel(), f"{prod(v)} <= {x.numel()}" # CUDAGRAPH unfriendly? return x.flatten()[:prod(v)].view(*v) def _fp8_quantize( A: torch.Tensor, A_scale: Optional[torch.Tensor], - block_shape: Optional[list[int]], + per_act_token: bool, + block_shape: Optional[list[int]] = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Perform fp8 quantization on the inputs. If a block_shape is provided, the output will be blocked. 
""" if block_shape is None: - A, A_scale = ops.scaled_fp8_quant(A, A_scale) + A, A_scale = ops.scaled_fp8_quant( + A, A_scale, use_per_token_if_dynamic=per_act_token) else: assert len(block_shape) == 2 _, block_k = block_shape[0], block_shape[1] A, A_scale = per_token_group_quant_fp8(A, block_k) - assert cdiv(A.shape[-1], block_k) == A_scale.shape[-1] + assert cdiv(A.size(-1), block_k) == A_scale.size(-1) + return A, A_scale +def _int8_quantize( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + per_act_token: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Perform int8 quantization on the inputs. If a block_shape + is provided, the output will be blocked. + """ + + # If weights are per-channel (per_channel_quant=True), then + # activations apply per-token quantization. Otherwise, assume + # activation tensor-wise fp8/int8 quantization, dynamic or static + if block_shape is None: + assert per_act_token, \ + "int8 quantization only supports block or channel-wise" + A, A_scale = per_token_quant_int8(A) + else: + assert len(block_shape) == 2 + _, block_k = block_shape[0], block_shape[1] + A, A_scale = per_token_group_quant_int8(A, block_k) + assert cdiv(A.size(-1), block_k) == A_scale.size(-1) + + return A, A_scale + + +def moe_kernel_quantize_input( + A: torch.Tensor, + A_scale: Optional[torch.Tensor], + qtype: Optional[torch.dtype], + per_channel_quant: bool, + block_shape: Optional[list[int]] = None, +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: + if qtype == torch.float8_e4m3fn: + return _fp8_quantize(A, A_scale, per_channel_quant, block_shape) + elif qtype == torch.int8: + return _int8_quantize(A, A_scale, per_channel_quant, block_shape) + else: + assert A_scale is None + return A, A_scale + + def _fp8_perm(m: torch.Tensor, idx: torch.Tensor) -> torch.Tensor: """ A permutation routine that works on fp8 types. """ - if torch.is_floating_point(m) and torch.finfo(m.dtype).bits == 8: + if torch.is_floating_point(m) and m.dtype.itemsize == 1: return m.view(dtype=torch.uint8)[idx, ...].view(dtype=m.dtype) else: return m[idx, ...] 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index cfd398c07fb9..f4cdc3db1a0d 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 +import functools import importlib.util from typing import Any, Callable, Optional @@ -9,6 +10,7 @@ from torch.nn.parameter import Parameter import vllm.envs as envs +import vllm.model_executor.layers.fused_moe.modular_kernel as mk from vllm import _custom_ops as ops from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger @@ -434,6 +436,7 @@ class Fp8MoEMethod(FusedMoEMethodBase): """ def __init__(self, quant_config: Fp8Config): + from vllm.model_executor.layers.fused_moe import fused_experts self.quant_config = quant_config self.block_quant = self.quant_config.weight_block_size is not None @@ -458,6 +461,11 @@ def __init__(self, quant_config: Fp8Config): logger.warning_once( "DeepGemm not supported on the current platform.") + self.fused_experts = functools.partial( + fused_experts, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm) + def create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, params_dtype: torch.dtype, **extra_weight_attrs): @@ -783,6 +791,31 @@ def process_weights_after_loading(self, layer: Module) -> None: del layer.w13_input_scale del layer.w2_input_scale + def set_prepare_finalize( + self, + dp_size: int, + world_size: int, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + ) -> bool: + from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( + TritonOrDeepGemmExperts) + + if self.use_marlin or self.rocm_aiter_moe_enabled: + return False + + experts = TritonOrDeepGemmExperts( + use_fp8_w8a8=True, + block_shape=self.quant_config.weight_block_size, + allow_deep_gemm=self.allow_deep_gemm, + ) + + self.fused_experts = mk.FusedMoEModularKernel( + prepare_finalize, + experts, + ) + + return True + def apply( self, layer: torch.nn.Module, @@ -801,10 +834,6 @@ def apply( apply_router_weight_on_input: bool = False, activation: str = "silu", ) -> torch.Tensor: - from vllm.model_executor.layers.fused_moe import fused_experts - from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - rocm_aiter_fused_experts) - topk_weights, topk_ids = FusedMoE.select_experts( hidden_states=x, router_logits=router_logits, @@ -819,6 +848,8 @@ def apply( ) if self.rocm_aiter_moe_enabled: + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 + rocm_aiter_fused_experts) return rocm_aiter_fused_experts( x, layer.w13_weight, @@ -835,8 +866,7 @@ def apply( a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale, block_shape=self.quant_config.weight_block_size) - - if self.use_marlin: + elif self.use_marlin: assert activation == "silu", ( f"{activation} not supported for Marlin MoE.") assert not apply_router_weight_on_input, ( @@ -853,28 +883,26 @@ def apply( quant_type_id=scalar_types.float8_e4m3fn.id, global_num_experts=global_num_experts, expert_map=expert_map) - - return fused_experts( - x, - layer.w13_weight, - layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=activation, - use_fp8_w8a8=True, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, - 
w1_scale=(layer.w13_weight_scale_inv - if self.block_quant else layer.w13_weight_scale), - w2_scale=(layer.w2_weight_scale_inv - if self.block_quant else layer.w2_weight_scale), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - block_shape=self.quant_config.weight_block_size, - allow_deep_gemm=self.allow_deep_gemm, - ) + else: + return self.fused_experts( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=True, + activation=activation, + use_fp8_w8a8=True, + global_num_experts=global_num_experts, + apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=expert_map, + w1_scale=(layer.w13_weight_scale_inv + if self.block_quant else layer.w13_weight_scale), + w2_scale=(layer.w2_weight_scale_inv + if self.block_quant else layer.w2_weight_scale), + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) class Fp8KVCacheMethod(BaseKVCacheMethod): diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 9ec245cce189..850fba2604e1 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -79,7 +79,6 @@ def __init__( prefix=prefix, ) self.config = config - self.tp_size = get_tensor_model_parallel_world_size() self.d_model = config.d_model self.intermediate_size = (self.config.ffn_config.ffn_hidden_size // self.tp_size) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0366895ef02e..680b7e614dd6 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -31,9 +31,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, ModelConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -143,7 +141,8 @@ def __init__( intermediate_size=intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=False, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), prefix=f"{prefix}.shared_experts", ) @@ -154,6 +153,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: shared_output = self.shared_experts(hidden_states) # router_logits: (num_tokens, n_experts) router_logits, _ = self.gate(hidden_states) + if hidden_states.dtype != torch.float16: final_hidden_states = self.experts( hidden_states=hidden_states, @@ -171,9 +171,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: # See DeepseekV2DecoderLayer for more details. final_hidden_states = final_hidden_states + shared_output \ * (1. 
/ self.routed_scaling_factor) + if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( - final_hidden_states) + final_hidden_states = ( + self.experts.maybe_all_reduce_tensor_model_parallel( + final_hidden_states)) return final_hidden_states.view(num_tokens, hidden_dim) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index 0fdc30f36f9b..dfd0804f21cf 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -25,8 +25,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -89,7 +88,7 @@ def __init__(self, quant_config=quant_config, bias=False, prefix=f"{prefix}.shared_expert", - reduce_results=False, # We need to do scatter before reduce + reduce_results=self.experts.must_reduce_shared_expert_outputs(), ) def forward(self, hidden_states): @@ -102,7 +101,8 @@ def forward(self, hidden_states): experts_out = routed_out + shared_out if self.tp_size > 1: - experts_out = tensor_model_parallel_all_reduce(experts_out) + experts_out = self.experts.maybe_all_reduce_tensor_model_parallel( + experts_out) return experts_out diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 14f9f8158940..ae1c146cf3f2 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -33,9 +33,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -129,7 +127,8 @@ def __init__( intermediate_size=config.shared_expert_intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, - reduce_results=False, + reduce_results=self.experts.must_reduce_shared_expert_outputs( + ), ) else: self.shared_expert = None @@ -156,7 +155,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: if shared_output is not None: final_hidden_states = final_hidden_states + shared_output if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 final_hidden_states) return final_hidden_states.view(orig_shape) diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 51cfa5796187..1fef37a96ea9 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -30,9 +30,7 @@ from vllm.attention import Attention from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig -from vllm.distributed import (get_pp_group, - get_tensor_model_parallel_world_size, - tensor_model_parallel_all_reduce) +from vllm.distributed import 
get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe import FusedMoE @@ -137,7 +135,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: router_logits=router_logits) final_hidden_states = final_hidden_states if self.tp_size > 1: - final_hidden_states = tensor_model_parallel_all_reduce( + final_hidden_states = self.experts.maybe_all_reduce_tensor_model_parallel( # noqa E501 final_hidden_states) return final_hidden_states.view(orig_shape) diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py index 9163b97c51a0..bdee8b2f821d 100644 --- a/vllm/platforms/cuda.py +++ b/vllm/platforms/cuda.py @@ -158,6 +158,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None: "currently not supported with CUDA Graphs.") vllm_config.model_config.enforce_eager = True compilation_config.use_cudagraph = False + compilation_config.use_inductor = False @classmethod def get_current_memory_usage(cls, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 69fc1ac69ab6..83e181116577 100644 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -865,8 +865,10 @@ def forward( assert output is not None, "Output tensor must be provided." if attn_metadata is None: - # Profiling run. - return output + # The zero fill is required when used with DP + EP + # to ensure all ranks within a DP group compute the + # same expert outputs. + return output.fill_(0) num_actual_toks = attn_metadata.num_actual_tokens diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 5352b1c5a37c..d85701fa93df 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -341,7 +341,8 @@ def init_worker_distributed_environment( distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 9eea26d85249..25715407ceee 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -265,4 +265,5 @@ def init_tpu_worker_distributed_environment( backend="gloo", ) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 1436a404335a..a92cf1e5a3b3 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -390,7 +390,8 @@ def init_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) def get_cache_block_size_bytes(self) -> int: """Return the size in bytes of a single KV cache block. 
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py index 7898c645d66a..42882992f2da 100644 --- a/vllm/worker/hpu_worker.py +++ b/vllm/worker/hpu_worker.py @@ -416,7 +416,8 @@ def init_worker_distributed_environment( backend='hccl') ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) if torch.distributed.is_initialized(): torch_world_size = torch.distributed.get_world_size() @@ -442,7 +443,8 @@ def init_worker_distributed_environment( torch.distributed.all_reduce(dummy_tensor_hpu) assert dummy_tensor_hpu.item() == parallel_config.world_size ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 4bb9bea022f9..891ed66599dc 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -76,7 +76,8 @@ def init_device(self) -> None: ) ensure_model_parallel_initialized( self.parallel_config.tensor_parallel_size, - self.parallel_config.pipeline_parallel_size) + self.parallel_config.pipeline_parallel_size, + self.parallel_config.enable_expert_parallel) # Device initialization should happen after initializing the distributed # runtime. diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 17f636765ff9..41546462e5c4 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -530,7 +530,8 @@ def init_worker_distributed_environment( init_distributed_environment(parallel_config.world_size, rank, distributed_init_method, local_rank) ensure_model_parallel_initialized(parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) ensure_kv_transfer_initialized(vllm_config) diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py index 17f533525171..65085f80f97a 100644 --- a/vllm/worker/xpu_worker.py +++ b/vllm/worker/xpu_worker.py @@ -176,7 +176,8 @@ def init_worker_distributed_environment(self) -> None: ensure_model_parallel_initialized( parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size) + parallel_config.pipeline_parallel_size, + parallel_config.enable_expert_parallel) # global all_reduce needed for overall oneccl warm up torch.distributed.all_reduce(torch.zeros(1).xpu()) From 856865008e1a8ffce393901c0245df265b5dfc3f Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Date: Wed, 14 May 2025 16:49:56 -0400 Subject: [PATCH 03/58] [CI] Disable Failing Tests (#18165) --- tests/spec_decode/e2e/test_eagle_correctness.py | 2 ++ tests/v1/engine/test_engine_core_client.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/spec_decode/e2e/test_eagle_correctness.py b/tests/spec_decode/e2e/test_eagle_correctness.py index eee535a146f4..2814bb6d3773 100644 --- a/tests/spec_decode/e2e/test_eagle_correctness.py +++ b/tests/spec_decode/e2e/test_eagle_correctness.py @@ -178,6 +178,8 @@ def test_eagle_e2e_greedy_correctness_cuda_graph( batch_size, output_len, seed) +# TRACKING: https://github.com/vllm-project/vllm/issues/18166 +@pytest.mark.skip(reason="RE-ENABLE: Failing on main.") @pytest.mark.parametrize( "common_llm_kwargs", [{ diff --git 
a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 452fe1e37e2c..671d74b83b85 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -256,6 +256,8 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): client.shutdown() +# TRACKING: https://github.com/vllm-project/vllm/issues/18167 +@pytest.mark.skip(reason="RE-ENABLE: this test is failing on main.") @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], From 7e55a34492ad107955a0a57f2d9cbbccd1c3ef89 Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 14 May 2025 21:03:20 +0000 Subject: [PATCH 04/58] Local attention optimization for NIXL Signed-off-by: mgoin --- .../kv_connector/v1/nixl_connector.py | 167 ++++++++++++++++-- 1 file changed, 153 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index abd1ea2bea82..131652ad1cd5 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -88,6 +88,7 @@ class NixlConnector(KVConnectorBase_V1): def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): assert vllm_config.kv_transfer_config is not None + self.vllm_config = vllm_config self.engine_id = vllm_config.kv_transfer_config.engine_id if role == KVConnectorRole.SCHEDULER: @@ -96,7 +97,8 @@ def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): self.connector_worker: Optional[NixlConnectorWorker] = None elif role == KVConnectorRole.WORKER: self.connector_scheduler = None - self.connector_worker = NixlConnectorWorker(str(self.engine_id)) + self.connector_worker = NixlConnectorWorker( + str(self.engine_id), vllm_config) ############################################################ # Scheduler Side Methods @@ -302,7 +304,7 @@ def request_finished( class NixlConnectorWorker: """Implementation of Worker side methods""" - def __init__(self, engine_id: str): + def __init__(self, engine_id: str, vllm_config: VllmConfig): if NixlWrapper is None: logger.error("NIXL is not available") raise RuntimeError("NIXL is not available") @@ -329,6 +331,7 @@ def __init__(self, engine_id: str): # Number of NIXL regions. Currently one region per cache # (so 1 per layer for MLA, otherwise 2 per layer) self.num_regions = 0 + self.num_layers = 0 # nixl_prepped_dlist_handle (int). self.src_xfer_side_handle: int = 0 @@ -355,6 +358,92 @@ def __init__(self, engine_id: str): # Background thread for establishing new connections. self._nixl_handshake_listener_t: Optional[threading.Thread] = None + self.vllm_config = vllm_config + self.block_size = vllm_config.cache_config.block_size + + def _get_layer_index_from_name(self, layer_name: str) -> Optional[int]: + """ + Parses the numerical index from a layer name string. + Example: "model.layers.0.self_attn" -> 0 + """ + parts = layer_name.split('.') + for i, part in enumerate(parts): + if part == "layers" and i + 1 < len(parts): + try: + return int(parts[i + 1]) + except ValueError: + logger.warning( + f"Could not parse layer index from part '{parts[i+1]}' in layer name '{layer_name}'" + ) + return None + logger.debug( + f"Could not find 'layers.X' pattern in layer name '{layer_name}' to extract index." 
+ ) + return None + + def _get_llama4_layer_strategy(self, layer_name: str) -> Optional[int]: + """ + Determines the transfer strategy for a given layer type. + At the moment this is used for Llama 4 models (RoPE vs NoPE). + Returns: + Optional[int]: chunk_size + """ + layer_idx = self._get_layer_index_from_name(layer_name) + if layer_idx is None: + logger.debug( + f"Could not determine layer index for '{layer_name}'. Defaulting to global transfer." + ) + return None + + model_cfg = self.vllm_config.model_config + if not hasattr(model_cfg, 'hf_text_config'): + logger.debug( + "Model config does not have 'hf_text_config'. Defaulting to global transfer for layer %s.", + layer_name) + return None + + model_hf_text_config = model_cfg.hf_text_config + if not hasattr(model_hf_text_config, 'no_rope_layers'): + logger.debug( + "Model hf_text_config does not have 'no_rope_layers'. Defaulting to global transfer for layer %s.", + layer_name) + return None + elif not hasattr(model_hf_text_config, 'attention_chunk_size'): + logger.debug( + "Model hf_text_config does not have 'attention_chunk_size'. Defaulting to global transfer for layer %s.", + layer_name) + return None + + no_rope_layers = model_hf_text_config.no_rope_layers + if not isinstance(no_rope_layers, list) or not no_rope_layers: + logger.debug( + "'no_rope_layers' is not a valid list. Defaulting to global transfer for layer %s.", + layer_name) + return None + + chunk_size = getattr(model_hf_text_config, 'attention_chunk_size', + None) + if not isinstance(chunk_size, int) or chunk_size <= 0: + logger.debug( + "'attention_chunk_size' is not a valid integer. Defaulting to global transfer for layer %s.", + layer_name) + return None + + if not (0 <= layer_idx < len(no_rope_layers)): + logger.debug( + f"Layer index {layer_idx} is out of bounds for 'no_rope_layers' list (len {len(no_rope_layers)}). 
Defaulting to global transfer for layer %s.", + layer_name) + return None + + # Llama 4 specific logic: + # no_rope_layers[layer_idx] == 0 means NoPE (global) + # Any other value means RoPE (local chunked) + if no_rope_layers[layer_idx] == 0: + return None # Global attention + else: + # Llama 4 RoPE layers have a fixed chunk_size token window + return chunk_size + @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, ready_event: threading.Event, rank: int): @@ -465,6 +554,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): kv_caches_base_addr.append(base_addr) self.kv_caches_base_addr[self.engine_id] = kv_caches_base_addr self.num_regions = len(caches_data) + self.num_layers = len(self.kv_caches.keys()) descs = self.nixl_wrapper.get_reg_descs(caches_data, "VRAM") logger.debug("Registering descs: %s", caches_data) @@ -694,17 +784,48 @@ def _read_blocks( if num_local_blocks < num_remote_blocks: remote_block_ids = remote_block_ids[-num_local_blocks:] + local_block_descs_ids: list[int] = [] + remote_block_descs_ids: list[int] = [] + + # Iterate through all layers this worker is responsible for + for layer_idx, layer_name in enumerate(self.kv_caches.keys()): + + if chunk_size := self._get_llama4_layer_strategy(layer_name): + # Llama 4 specific logic + logger.debug( + f"Layer {layer_name} is local attention with chunk_size: {chunk_size}" + ) + num_blocks_for_window = math.ceil(chunk_size / self.block_size) + # Get the last num_blocks_for_window blocks + layer_local_block_ids = local_block_ids[ + -num_blocks_for_window:] + layer_remote_block_ids = remote_block_ids[ + -num_blocks_for_window:] + else: + logger.debug(f"Layer {layer_name} is global attention") + # If the layer is not chunked, we just use the + # full block lists (global attention) + layer_local_block_ids = local_block_ids + layer_remote_block_ids = remote_block_ids + + # Get descs ids for the layer. + layer_remote_desc_ids = self._get_block_descs_ids( + dst_engine_id, layer_remote_block_ids, layer_idx) + layer_local_desc_ids = self._get_block_descs_ids( + self.engine_id, layer_local_block_ids, layer_idx) + + remote_block_descs_ids.extend(layer_remote_desc_ids) + local_block_descs_ids.extend(layer_local_desc_ids) + + logger.debug( + f"NIXL READ for {len(local_block_descs_ids)} local descs and {len(remote_block_descs_ids)} remote descs" + ) + assert len(local_block_descs_ids) == len(remote_block_descs_ids) + # Get side handles. local_xfer_side_handle = self.src_xfer_side_handle remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] - # Get descs ids. - remote_block_descs_ids = self._get_block_descs_ids( - dst_engine_id, remote_block_ids) - local_block_descs_ids = self._get_block_descs_ids( - self.engine_id, local_block_ids) - assert len(local_block_descs_ids) == len(remote_block_descs_ids) - # Prepare transfer with Nixl. handle = self.nixl_wrapper.make_prepped_xfer( "READ", @@ -721,12 +842,30 @@ def _read_blocks( # Use handle to check completion in future step(). self._recving_transfers[request_id].append(handle) - def _get_block_descs_ids(self, engine_id: str, - block_ids: list[int]) -> list[int]: - """Get the descs ids for a set of block ids.""" + def _get_block_descs_ids(self, + engine_id: str, + block_ids: list[int], + layer_idx: Optional[int] = None) -> list[int]: + """ + Get the descs ids for a set of block ids. + If layer_idx is provided, we use the region_ids for the given layer. + Otherwise, we use all regions. 
+ """ + + if layer_idx is None: + region_ids = range(self.num_regions) + else: + assert layer_idx < self.num_layers + if self.num_layers < self.num_regions: + # If we have more regions than layers, we assume that + # the regions are organized as [K, V, K, V, ...] + assert 2 * self.num_layers == self.num_regions + region_ids = [2 * layer_idx, 2 * layer_idx + 1] + else: + # Otherwise, we assume we have MLA + assert self.num_layers == self.num_regions + region_ids = [layer_idx] - # range(1) for MLA, range(2) otherwise. - region_ids = range(self.num_regions) num_blocks = self.dst_num_blocks[engine_id] # Compute the desc ids for each block. From 8ea467d3f6344ed02b0dac48a3f4c3baad21b14b Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 14 May 2025 22:33:33 +0000 Subject: [PATCH 05/58] Clean up a lot! Signed-off-by: mgoin --- .../kv_connector/v1/nixl_connector.py | 155 ++++++------------ 1 file changed, 46 insertions(+), 109 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 131652ad1cd5..02bae470751e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -88,7 +88,6 @@ class NixlConnector(KVConnectorBase_V1): def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): assert vllm_config.kv_transfer_config is not None - self.vllm_config = vllm_config self.engine_id = vllm_config.kv_transfer_config.engine_id if role == KVConnectorRole.SCHEDULER: @@ -98,7 +97,7 @@ def __init__(self, vllm_config: VllmConfig, role: KVConnectorRole): elif role == KVConnectorRole.WORKER: self.connector_scheduler = None self.connector_worker = NixlConnectorWorker( - str(self.engine_id), vllm_config) + vllm_config, str(self.engine_id)) ############################################################ # Scheduler Side Methods @@ -304,7 +303,7 @@ def request_finished( class NixlConnectorWorker: """Implementation of Worker side methods""" - def __init__(self, engine_id: str, vllm_config: VllmConfig): + def __init__(self, vllm_config: VllmConfig, engine_id: str): if NixlWrapper is None: logger.error("NIXL is not available") raise RuntimeError("NIXL is not available") @@ -361,88 +360,9 @@ def __init__(self, engine_id: str, vllm_config: VllmConfig): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size - def _get_layer_index_from_name(self, layer_name: str) -> Optional[int]: - """ - Parses the numerical index from a layer name string. - Example: "model.layers.0.self_attn" -> 0 - """ - parts = layer_name.split('.') - for i, part in enumerate(parts): - if part == "layers" and i + 1 < len(parts): - try: - return int(parts[i + 1]) - except ValueError: - logger.warning( - f"Could not parse layer index from part '{parts[i+1]}' in layer name '{layer_name}'" - ) - return None - logger.debug( - f"Could not find 'layers.X' pattern in layer name '{layer_name}' to extract index." - ) - return None - - def _get_llama4_layer_strategy(self, layer_name: str) -> Optional[int]: - """ - Determines the transfer strategy for a given layer type. - At the moment this is used for Llama 4 models (RoPE vs NoPE). - Returns: - Optional[int]: chunk_size - """ - layer_idx = self._get_layer_index_from_name(layer_name) - if layer_idx is None: - logger.debug( - f"Could not determine layer index for '{layer_name}'. Defaulting to global transfer." 
- ) - return None - - model_cfg = self.vllm_config.model_config - if not hasattr(model_cfg, 'hf_text_config'): - logger.debug( - "Model config does not have 'hf_text_config'. Defaulting to global transfer for layer %s.", - layer_name) - return None - - model_hf_text_config = model_cfg.hf_text_config - if not hasattr(model_hf_text_config, 'no_rope_layers'): - logger.debug( - "Model hf_text_config does not have 'no_rope_layers'. Defaulting to global transfer for layer %s.", - layer_name) - return None - elif not hasattr(model_hf_text_config, 'attention_chunk_size'): - logger.debug( - "Model hf_text_config does not have 'attention_chunk_size'. Defaulting to global transfer for layer %s.", - layer_name) - return None - - no_rope_layers = model_hf_text_config.no_rope_layers - if not isinstance(no_rope_layers, list) or not no_rope_layers: - logger.debug( - "'no_rope_layers' is not a valid list. Defaulting to global transfer for layer %s.", - layer_name) - return None - - chunk_size = getattr(model_hf_text_config, 'attention_chunk_size', - None) - if not isinstance(chunk_size, int) or chunk_size <= 0: - logger.debug( - "'attention_chunk_size' is not a valid integer. Defaulting to global transfer for layer %s.", - layer_name) - return None - - if not (0 <= layer_idx < len(no_rope_layers)): - logger.debug( - f"Layer index {layer_idx} is out of bounds for 'no_rope_layers' list (len {len(no_rope_layers)}). Defaulting to global transfer for layer %s.", - layer_name) - return None - - # Llama 4 specific logic: - # no_rope_layers[layer_idx] == 0 means NoPE (global) - # Any other value means RoPE (local chunked) - if no_rope_layers[layer_idx] == 0: - return None # Global attention - else: - # Llama 4 RoPE layers have a fixed chunk_size token window - return chunk_size + # Llama 4 specific logic + # List of block window sizes for each layer for local attention + self.block_window_per_layer: list[Optional[int]] = [] @staticmethod def _nixl_handshake_listener(metadata: NixlAgentMetadata, @@ -556,6 +476,24 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): self.num_regions = len(caches_data) self.num_layers = len(self.kv_caches.keys()) + # Local attention chunking optimization (Llama 4) + if self.vllm_config.model_config.model_type == "llama4": + from transformers import Llama4TextConfig + assert isinstance(self.vllm_config.model_config.hf_text_config, + Llama4TextConfig) + llama4_config = self.vllm_config.model_config.hf_text_config + for layer_idx, _ in enumerate(self.kv_caches.keys()): + no_rope_layers = llama4_config.no_rope_layers + # no_rope_layers[layer_idx] == 0 means NoPE (global) + # Any other value means RoPE (local chunked) + chunk_size = None if no_rope_layers[ + layer_idx] == 0 else llama4_config.attention_chunk_size + chunkblock__size = math.ceil(chunk_size / self.block_size) + self.block_window_per_layer.append(chunkblock__size) + logger.debug("Llama 4 block window per layer mapping: %s", + self.block_window_per_layer) + assert len(self.block_window_per_layer) == self.num_layers + descs = self.nixl_wrapper.get_reg_descs(caches_data, "VRAM") logger.debug("Registering descs: %s", caches_data) self.nixl_wrapper.register_memory(descs) @@ -787,26 +725,28 @@ def _read_blocks( local_block_descs_ids: list[int] = [] remote_block_descs_ids: list[int] = [] - # Iterate through all layers this worker is responsible for - for layer_idx, layer_name in enumerate(self.kv_caches.keys()): - - if chunk_size := self._get_llama4_layer_strategy(layer_name): - # Llama 4 specific logic - 
logger.debug( - f"Layer {layer_name} is local attention with chunk_size: {chunk_size}" - ) - num_blocks_for_window = math.ceil(chunk_size / self.block_size) - # Get the last num_blocks_for_window blocks - layer_local_block_ids = local_block_ids[ - -num_blocks_for_window:] - layer_remote_block_ids = remote_block_ids[ - -num_blocks_for_window:] - else: - logger.debug(f"Layer {layer_name} is global attention") - # If the layer is not chunked, we just use the - # full block lists (global attention) - layer_local_block_ids = local_block_ids - layer_remote_block_ids = remote_block_ids + # Get descs ids. + if not self.block_window_per_layer: + # Default case: assume global attention + remote_block_descs_ids = self._get_block_descs_ids( + dst_engine_id, remote_block_ids) + local_block_descs_ids = self._get_block_descs_ids( + self.engine_id, local_block_ids) + assert len(local_block_descs_ids) == len(remote_block_descs_ids) + else: + # Llama 4 specific case: local attention with chunking + for layer_idx, block_window in enumerate( + self.block_window_per_layer): + # For each layer: + if block_window is None: + # If not chunked, we just use the + # full block lists (global attention) + layer_local_block_ids = local_block_ids + layer_remote_block_ids = remote_block_ids + else: + # If chunked, get the last block_window blocks + layer_local_block_ids = local_block_ids[-block_window:] + layer_remote_block_ids = remote_block_ids[-block_window:] # Get descs ids for the layer. layer_remote_desc_ids = self._get_block_descs_ids( @@ -817,9 +757,6 @@ def _read_blocks( remote_block_descs_ids.extend(layer_remote_desc_ids) local_block_descs_ids.extend(layer_local_desc_ids) - logger.debug( - f"NIXL READ for {len(local_block_descs_ids)} local descs and {len(remote_block_descs_ids)} remote descs" - ) assert len(local_block_descs_ids) == len(remote_block_descs_ids) # Get side handles. @@ -858,7 +795,7 @@ def _get_block_descs_ids(self, assert layer_idx < self.num_layers if self.num_layers < self.num_regions: # If we have more regions than layers, we assume that - # the regions are organized as [K, V, K, V, ...] + # the regions are organized as [K0, V0, K1, V1, ...] 
assert 2 * self.num_layers == self.num_regions region_ids = [2 * layer_idx, 2 * layer_idx + 1] else: From 73a8272afd3ca49cb245c4001adf48447f3a16c0 Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 14 May 2025 22:37:10 +0000 Subject: [PATCH 06/58] Small opt Signed-off-by: mgoin --- .../kv_connector/v1/nixl_connector.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 02bae470751e..5b4112eab286 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -360,7 +360,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.vllm_config = vllm_config self.block_size = vllm_config.cache_config.block_size - # Llama 4 specific logic + # Optimization for models with local attention (Llama 4 for now) # List of block window sizes for each layer for local attention self.block_window_per_layer: list[Optional[int]] = [] @@ -482,14 +482,14 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): assert isinstance(self.vllm_config.model_config.hf_text_config, Llama4TextConfig) llama4_config = self.vllm_config.model_config.hf_text_config - for layer_idx, _ in enumerate(self.kv_caches.keys()): + for layer_idx in range(self.num_layers): no_rope_layers = llama4_config.no_rope_layers # no_rope_layers[layer_idx] == 0 means NoPE (global) # Any other value means RoPE (local chunked) chunk_size = None if no_rope_layers[ layer_idx] == 0 else llama4_config.attention_chunk_size - chunkblock__size = math.ceil(chunk_size / self.block_size) - self.block_window_per_layer.append(chunkblock__size) + chunk_block_size = math.ceil(chunk_size / self.block_size) + self.block_window_per_layer.append(chunk_block_size) logger.debug("Llama 4 block window per layer mapping: %s", self.block_window_per_layer) assert len(self.block_window_per_layer) == self.num_layers @@ -722,10 +722,13 @@ def _read_blocks( if num_local_blocks < num_remote_blocks: remote_block_ids = remote_block_ids[-num_local_blocks:] - local_block_descs_ids: list[int] = [] - remote_block_descs_ids: list[int] = [] + # Get side handles. + local_xfer_side_handle = self.src_xfer_side_handle + remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] # Get descs ids. + local_block_descs_ids: list[int] = [] + remote_block_descs_ids: list[int] = [] if not self.block_window_per_layer: # Default case: assume global attention remote_block_descs_ids = self._get_block_descs_ids( @@ -759,10 +762,6 @@ def _read_blocks( assert len(local_block_descs_ids) == len(remote_block_descs_ids) - # Get side handles. - local_xfer_side_handle = self.src_xfer_side_handle - remote_xfer_side_handle = self.dst_xfer_side_handles[dst_engine_id] - # Prepare transfer with Nixl. 
handle = self.nixl_wrapper.make_prepped_xfer( "READ", From 749f792553d48dad68855f32910fafa61a28297e Mon Sep 17 00:00:00 2001 From: David Xia Date: Wed, 14 May 2025 18:43:32 -0400 Subject: [PATCH 07/58] [Frontend] decrease import time of vllm.multimodal (#18031) Co-authored-by: Aaron Pham --- vllm/multimodal/inputs.py | 45 +++++++++++++++++++---------------- vllm/multimodal/parse.py | 21 +++++++++------- vllm/multimodal/processing.py | 13 ++++++---- 3 files changed, 45 insertions(+), 34 deletions(-) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 61d8eb62ffaf..2335af843ed5 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -10,40 +10,43 @@ Union, cast, final) import numpy as np -import torch -import torch.types -from PIL.Image import Image -from transformers import BatchFeature from typing_extensions import NotRequired, TypeAlias from vllm.jsontree import JSONTree, json_map_leaves -from vllm.utils import full_groupby, is_list_of +from vllm.utils import LazyLoader, full_groupby, is_list_of if TYPE_CHECKING: + import torch + import torch.types + from PIL.Image import Image + from transformers.feature_extraction_utils import BatchFeature + from .hasher import MultiModalHashDict +else: + torch = LazyLoader("torch", globals(), "torch") _T = TypeVar("_T") -HfImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"] """ A {class}`transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. """ -HfVideoItem: TypeAlias = Union[list[Image], np.ndarray, torch.Tensor, - list[np.ndarray], list[torch.Tensor]] +HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor", + list[np.ndarray], list["torch.Tensor"]] """ A {class}`transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. """ -HfAudioItem: TypeAlias = Union[list[float], np.ndarray, torch.Tensor] +HfAudioItem: TypeAlias = Union[list[float], np.ndarray, "torch.Tensor"] """ Represents a single audio item, which can be passed to a HuggingFace `AudioProcessor`. """ -ImageItem: TypeAlias = Union[HfImageItem, torch.Tensor] +ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"] """ A {class}`transformers.image_utils.ImageInput` representing a single image item, which can be passed to a HuggingFace `ImageProcessor`. @@ -53,7 +56,7 @@ these are directly passed to the model without HF processing. """ -VideoItem: TypeAlias = Union[HfVideoItem, torch.Tensor] +VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"] """ A {class}`transformers.image_utils.VideoInput` representing a single video item, which can be passed to a HuggingFace `VideoProcessor`. @@ -64,7 +67,7 @@ """ AudioItem: TypeAlias = Union[HfAudioItem, tuple[np.ndarray, float], - torch.Tensor] + "torch.Tensor"] """ Represents a single audio item, which can be passed to a HuggingFace `AudioProcessor`. @@ -132,7 +135,7 @@ class PlaceholderRange: length: int """The length of the placeholder.""" - is_embed: Optional[torch.Tensor] = None + is_embed: Optional["torch.Tensor"] = None """ A boolean mask of shape `(length,)` indicating which positions between `offset` and `offset + length` to assign embeddings to. 
@@ -158,8 +161,8 @@ def __eq__(self, other: object) -> bool: return nested_tensors_equal(self.is_embed, other.is_embed) -NestedTensors = Union[list["NestedTensors"], list[torch.Tensor], torch.Tensor, - tuple[torch.Tensor, ...]] +NestedTensors: TypeAlias = Union[list["NestedTensors"], list["torch.Tensor"], + "torch.Tensor", tuple["torch.Tensor", ...]] """ Uses a list instead of a tensor if the dimensions of each element do not match. """ @@ -261,7 +264,7 @@ def build_elems( """ Construct {class}`MultiModalFieldElem` instances to represent the provided data. - + This is the inverse of {meth}`reduce_data`. """ raise NotImplementedError @@ -422,7 +425,7 @@ def flat(modality: str, modality: The modality of the multi-modal item that uses this keyword argument. slices: For each multi-modal item, a slice (dim=0) or a tuple of - slices (dim>0) that is used to extract the data corresponding + slices (dim>0) that is used to extract the data corresponding to it. dim: The dimension to extract data, default to 0. @@ -465,7 +468,7 @@ def flat(modality: str, @staticmethod def flat_from_sizes(modality: str, - size_per_item: torch.Tensor, + size_per_item: "torch.Tensor", dim: int = 0): """ Defines a field where an element in the batch is obtained by @@ -602,7 +605,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod def from_hf_inputs( - hf_inputs: BatchFeature, + hf_inputs: "BatchFeature", config_by_key: Mapping[str, MultiModalFieldConfig], ): # NOTE: This skips fields in `hf_inputs` that are not in `config_by_key` @@ -792,7 +795,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: return self._items_by_modality[modality] -MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +MultiModalPlaceholderDict: TypeAlias = Mapping[str, Sequence[PlaceholderRange]] """ A dictionary containing placeholder ranges for each modality. """ @@ -823,7 +826,7 @@ class MultiModalInputs(TypedDict): mm_hashes: Optional["MultiModalHashDict"] """The hashes of the multi-modal data.""" - mm_placeholders: MultiModalPlaceholderDict + mm_placeholders: "MultiModalPlaceholderDict" """ For each modality, information about the placeholder tokens in `prompt_token_ids`. 
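The `inputs.py` changes above hinge on deferring the heavy `torch` / `PIL` / `transformers` imports: type-only imports move under `TYPE_CHECKING` with quoted annotations, and runtime access goes through `vllm.utils.LazyLoader`. A rough sketch of that idiom is shown below; the `LazyModule` stand-in is an assumption about how such a loader behaves, not the actual `LazyLoader` code:

```python
# Hedged sketch of the lazy-import idiom used in this patch.
import importlib
import types
from typing import TYPE_CHECKING


class LazyModule(types.ModuleType):
    """Module proxy that performs the real import on first attribute access."""

    def __init__(self, module_name: str):
        super().__init__(module_name)
        self._module_name = module_name
        self._module = None

    def __getattr__(self, attr: str):
        if self._module is None:
            self._module = importlib.import_module(self._module_name)
        return getattr(self._module, attr)


if TYPE_CHECKING:
    import torch  # real import only for type checkers
else:
    torch = LazyModule("torch")  # deferred until first use at runtime


def make_mask(length: int) -> "torch.Tensor":
    # torch is actually imported here, not when this module is imported.
    return torch.ones(length, dtype=torch.bool)
```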
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index f9588431c8ef..6e9ec9555802 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -8,11 +8,9 @@ import numpy as np import torch -from PIL.Image import Image -from transformers import BatchFeature from typing_extensions import TypeAlias, TypeGuard, assert_never -from vllm.utils import is_list_of +from vllm.utils import LazyLoader, is_list_of from .audio import AudioResampler from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem, @@ -22,6 +20,11 @@ _T = TypeVar("_T") _I = TypeVar("_I") +if TYPE_CHECKING: + import PIL.Image as PILImage +else: + PILImage = LazyLoader("PILImage", globals(), "PIL.Image") + class ModalityDataItems(ABC, Generic[_T, _I]): """ @@ -131,6 +134,8 @@ def __init__( Mapping[str, MultiModalFieldConfig], ], ) -> None: + from transformers.feature_extraction_utils import BatchFeature + super().__init__(data, modality) missing_required_data_keys = required_fields - data.keys() @@ -200,7 +205,7 @@ def __init__(self, data: Sequence[HfImageItem]) -> None: def get_image_size(self, item_idx: int) -> ImageSize: image = self.get(item_idx) - if isinstance(image, Image): + if isinstance(image, PILImage.Image): return ImageSize(*image.size) if isinstance(image, (np.ndarray, torch.Tensor)): _, h, w = image.shape @@ -226,7 +231,7 @@ def get_num_frames(self, item_idx: int) -> int: def get_frame_size(self, item_idx: int) -> ImageSize: image = self.get(item_idx)[0] # Assume that the video isn't empty - if isinstance(image, Image): + if isinstance(image, PILImage.Image): return ImageSize(*image.size) if isinstance(image, (np.ndarray, torch.Tensor)): _, h, w = image.shape @@ -253,7 +258,7 @@ class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]): def get_count(self, modality: str, *, strict: bool = True) -> int: """ Get the number of data items belonging to a modality. - + If `strict=False`, return `0` instead of raising {exc}`KeyError` even if the modality is not found. 
""" @@ -399,7 +404,7 @@ def _parse_image_data( if self._is_embeddings(data): return ImageEmbeddingItems(data) - if (isinstance(data, Image) + if (isinstance(data, PILImage.Image) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 3): data_items = [data] @@ -420,7 +425,7 @@ def _parse_video_data( if self._is_embeddings(data): return VideoEmbeddingItems(data) - if (is_list_of(data, Image) + if (is_list_of(data, PILImage.Image) or isinstance(data, (np.ndarray, torch.Tensor)) and data.ndim == 4): data_items = [data] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 92f9e70b5234..320a26f37555 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -13,7 +13,6 @@ TypeVar, Union, cast) import torch -from transformers import BatchFeature, PretrainedConfig, ProcessorMixin from typing_extensions import assert_never from vllm.inputs import InputProcessingContext @@ -31,6 +30,10 @@ MultiModalDataParser) if TYPE_CHECKING: + from transformers.configuration_utils import PretrainedConfig + from transformers.feature_extraction_utils import BatchFeature + from transformers.processing_utils import ProcessorMixin + from .profiling import BaseDummyInputsBuilder logger = init_logger(__name__) @@ -1047,10 +1050,10 @@ def model_id(self) -> str: def get_tokenizer(self) -> AnyTokenizer: return self.ctx.tokenizer - def get_hf_config(self) -> PretrainedConfig: + def get_hf_config(self) -> "PretrainedConfig": return self.ctx.get_hf_config() - def get_hf_processor(self, **kwargs: object) -> ProcessorMixin: + def get_hf_processor(self, **kwargs: object) -> "ProcessorMixin": """ Subclasses can override this method to handle specific kwargs from model config or user inputs. @@ -1165,7 +1168,7 @@ def _to_mm_items( @abstractmethod def _get_mm_fields_config( self, - hf_inputs: BatchFeature, + hf_inputs: "BatchFeature", hf_processor_mm_kwargs: Mapping[str, object], ) -> Mapping[str, MultiModalFieldConfig]: """Given the HF-processed data, output the metadata of each field.""" @@ -1222,7 +1225,7 @@ def _call_hf_processor( # This refers to the data to be passed to HF processor. mm_data: Mapping[str, object], mm_kwargs: Mapping[str, object], - ) -> BatchFeature: + ) -> "BatchFeature": """ Call the HF processor on the prompt text and associated multi-modal data. 
From d93c976a0d78639d0ea9074a9e01607f0d5c5670 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Wed, 14 May 2025 18:43:55 -0400 Subject: [PATCH 08/58] [Kernel] Have rotary embeddings support tensors (#18046) Signed-off-by: Lucas Wilkinson --- csrc/pos_encoding_kernels.cu | 40 ++++++++++++++------- tests/kernels/core/test_pos_encoding.py | 14 +++++++- tests/kernels/core/test_rotary_embedding.py | 17 +++++++-- vllm/_custom_ops.py | 19 ++-------- 4 files changed, 59 insertions(+), 31 deletions(-) diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu index ef6dd1c0978d..266f2a0667a2 100644 --- a/csrc/pos_encoding_kernels.cu +++ b/csrc/pos_encoding_kernels.cu @@ -44,7 +44,8 @@ inline __device__ void apply_rotary_embedding( // head_size] const scalar_t* cache_ptr, const int head_size, const int num_heads, const int num_kv_heads, const int rot_dim, const int token_idx, - const int64_t query_stride, const int64_t key_stride) { + const int64_t query_stride, const int64_t key_stride, + const int64_t head_stride) { const int embed_dim = rot_dim / 2; const scalar_t* cos_ptr = cache_ptr; const scalar_t* sin_ptr = cache_ptr + embed_dim; @@ -52,7 +53,8 @@ inline __device__ void apply_rotary_embedding( const int nq = num_heads * embed_dim; for (int i = threadIdx.x; i < nq; i += blockDim.x) { const int head_idx = i / embed_dim; - const int64_t token_head = token_idx * query_stride + head_idx * head_size; + const int64_t token_head = + token_idx * query_stride + head_idx * head_stride; const int rot_offset = i % embed_dim; apply_token_rotary_embedding( query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); @@ -62,7 +64,8 @@ inline __device__ void apply_rotary_embedding( const int nk = num_kv_heads * embed_dim; for (int i = threadIdx.x; i < nk; i += blockDim.x) { const int head_idx = i / embed_dim; - const int64_t token_head = token_idx * key_stride + head_idx * head_size; + const int64_t token_head = + token_idx * key_stride + head_idx * head_stride; const int rot_offset = i % embed_dim; apply_token_rotary_embedding( key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim); @@ -84,7 +87,8 @@ __global__ void rotary_embedding_kernel( const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // // 2] const int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int num_heads, const int num_kv_heads, const int head_size) { + const int64_t head_stride, const int num_heads, const int num_kv_heads, + const int head_size) { // Each thread block is responsible for one token. const int token_idx = blockIdx.x; int64_t pos = positions[token_idx]; @@ -92,7 +96,7 @@ __global__ void rotary_embedding_kernel( apply_rotary_embedding( query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride); + token_idx, query_stride, key_stride, head_stride); } template @@ -109,9 +113,9 @@ __global__ void batched_rotary_embedding_kernel( const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // // 2] const int64_t* __restrict__ cos_sin_cache_offsets, // [batch_size, seq_len] - // or [num_tokens] const int rot_dim, const int64_t query_stride, const int64_t key_stride, - const int num_heads, const int num_kv_heads, const int head_size) { + const int64_t head_stride, const int num_heads, const int num_kv_heads, + const int head_size) { // Each thread block is responsible for one token. 
const int token_idx = blockIdx.x; int64_t pos = positions[token_idx]; @@ -121,7 +125,7 @@ __global__ void batched_rotary_embedding_kernel( apply_rotary_embedding( query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim, - token_idx, query_stride, key_stride); + token_idx, query_stride, key_stride, head_stride); } } // namespace vllm @@ -179,6 +183,12 @@ void rotary_embedding( int seq_dim_idx = positions_ndim - 1; int64_t query_stride = query.stride(seq_dim_idx); int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; + // Determine head stride: for [*, heads, head_size] use stride of last dim; + // for flat [*, heads*head_size], heads blocks are contiguous of size + // head_size + int query_ndim = query.dim(); + int64_t head_stride = + (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); @@ -190,14 +200,14 @@ void rotary_embedding( positions.data_ptr(), query.data_ptr(), key.has_value() ? key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), rot_dim, query_stride, key_stride, - num_heads, num_kv_heads, head_size); + head_stride, num_heads, num_kv_heads, head_size); } else { vllm::rotary_embedding_kernel <<>>( positions.data_ptr(), query.data_ptr(), key.has_value() ? key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), rot_dim, query_stride, - key_stride, num_heads, num_kv_heads, head_size); + key_stride, head_stride, num_heads, num_kv_heads, head_size); } }); } @@ -263,6 +273,12 @@ void batched_rotary_embedding( int seq_dim_idx = positions_ndim - 1; int64_t query_stride = query.stride(seq_dim_idx); int64_t key_stride = key.has_value() ? key->stride(seq_dim_idx) : 0; + // Determine head stride: for [*, heads, head_size] use stride of last dim; + // for flat [*, heads*head_size], heads blocks are contiguous of size + // head_size + int query_ndim = query.dim(); + int64_t head_stride = + (query_ndim == positions_ndim + 2) ? query.stride(-2) : head_size; dim3 grid(num_tokens); dim3 block(std::min(num_heads * rot_dim / 2, 512)); @@ -276,7 +292,7 @@ void batched_rotary_embedding( key.has_value() ? key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, num_heads, num_kv_heads, head_size); + key_stride, head_stride, num_heads, num_kv_heads, head_size); } else { vllm::batched_rotary_embedding_kernel <<>>( @@ -284,7 +300,7 @@ void batched_rotary_embedding( key.has_value() ? 
key->data_ptr() : nullptr, cos_sin_cache.data_ptr(), cos_sin_cache_offsets.data_ptr(), rot_dim, query_stride, - key_stride, num_heads, num_kv_heads, head_size); + key_stride, head_stride, num_heads, num_kv_heads, head_size); } }); } diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index d81c7487b88c..383a3c83b84a 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -29,12 +29,20 @@ def _get_flat_tensor_shape(batch_size: int, seq_len: int, num_heads: int, return (batch_size, seq_len, num_heads * head_size) +# For testing sliced tensors +def _get_padded_tensor_shape(batch_size: int, seq_len: int, num_heads: int, + head_size: int) -> tuple[int, ...]: + return (batch_size, seq_len, num_heads, head_size + 64) + + def _get_batch_tensor_shape(batch_size: int, seq_len: int, num_heads: int, head_size: int) -> tuple[int, ...]: return (batch_size, seq_len, num_heads, head_size) -TENSORS_SHAPES_FN = [_get_batch_tensor_shape, _get_flat_tensor_shape] +TENSORS_SHAPES_FN = [ + _get_batch_tensor_shape, _get_flat_tensor_shape, _get_padded_tensor_shape +] @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) @@ -79,6 +87,10 @@ def test_rotary_embedding( query = torch.randn(query_shape, dtype=dtype) key = torch.randn_like(query) if use_key else None + # slice tensor if required, noop otherwise + query = query[..., :head_size] + key = key[..., :head_size] if use_key else None + # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. ref_query, ref_key = rope.forward_native(positions, query, key) diff --git a/tests/kernels/core/test_rotary_embedding.py b/tests/kernels/core/test_rotary_embedding.py index 4e54861005f2..8383f943b9fa 100644 --- a/tests/kernels/core/test_rotary_embedding.py +++ b/tests/kernels/core/test_rotary_embedding.py @@ -38,9 +38,10 @@ def rotary_embedding_opcheck(rot, @pytest.mark.parametrize("head_size", [32, 108]) @pytest.mark.parametrize("seq_len", [11, 1024]) @pytest.mark.parametrize("use_key", [True, False]) +@pytest.mark.parametrize("head_stride_is_contingous", [True, False]) def test_rotary_embedding_opcheck(dist_init, device, max_position, is_neox_style, rotary_dim, head_size, - seq_len, use_key): + seq_len, use_key, head_stride_is_contingous): batch_size = 1 base = 10000 num_heads = 7 @@ -50,15 +51,27 @@ def test_rotary_embedding_opcheck(dist_init, device, max_position, positions = torch.randint(0, max_position, (batch_size, seq_len), device=device) + head_stride = head_size + (64 if head_stride_is_contingous else 0) + query = torch.randn(batch_size, seq_len, - num_heads * head_size, + num_heads, + head_stride, dtype=torch.float32, device=device) key = torch.randn_like(query) if use_key else None + query = query[..., :head_size] + key = key[..., :head_size] if use_key else None rotary_embedding_opcheck(rot, positions, query, key) offsets = torch.zeros(batch_size * seq_len, device=device, dtype=torch.long) rotary_embedding_opcheck(rot, positions, query, key, offsets) + + # if we have a contiguous head stride, test the alternate + # [..., num_heads * head_dim] shape/layout + if head_stride_is_contingous: + rotary_embedding_opcheck( + rot, positions, query.flatten(start_dim=-2), + key.flatten(start_dim=-2) if use_key else None) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index c81300db5657..e74d139ab980 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -254,14 +254,8 @@ def rotary_embedding( cos_sin_cache: 
torch.Tensor, is_neox: bool, ) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support tensor slices - query_contiguous = query.contiguous() - key_contiguous = key.contiguous() if key is not None else None - torch.ops._C.rotary_embedding(positions, query_contiguous, key_contiguous, - head_size, cos_sin_cache, is_neox) - query.copy_(query_contiguous) - if key is not None: - key.copy_(key_contiguous) + torch.ops._C.rotary_embedding(positions, query, key, head_size, + cos_sin_cache, is_neox) def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, @@ -269,16 +263,9 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor, cos_sin_cache: torch.Tensor, is_neox: bool, rot_dim: int, cos_sin_cache_offsets: torch.Tensor) -> None: - # TODO: Remove this contiguous call when the kernel is updated to support tensor slices - query_contiguous = query.contiguous() - key_contiguous = key.contiguous() if key is not None else None - torch.ops._C.batched_rotary_embedding(positions, query_contiguous, - key_contiguous, head_size, + torch.ops._C.batched_rotary_embedding(positions, query, key, head_size, cos_sin_cache, is_neox, rot_dim, cos_sin_cache_offsets) - query.copy_(query_contiguous) - if key is not None: - key.copy_(key_contiguous) # layer norm ops From 2fc9075b82e05007f460992b3f9d42d2746c41cb Mon Sep 17 00:00:00 2001 From: Aaron Pham Date: Wed, 14 May 2025 18:45:24 -0400 Subject: [PATCH 09/58] [V1] Structured Outputs + Thinking compatibility (#16577) Signed-off-by: Aaron Pham Co-authored-by: Russell Bryant --- docs/source/features/reasoning_outputs.md | 4 +- .../llm/test_struct_output_generate.py | 96 ++++++++++++++++- vllm/config.py | 4 +- vllm/reasoning/abs_reasoning_parsers.py | 6 +- vllm/v1/core/sched/scheduler.py | 8 +- vllm/v1/structured_output/__init__.py | 101 +++++++++++++++--- vllm/v1/structured_output/backend_guidance.py | 22 ++-- vllm/v1/structured_output/backend_types.py | 17 ++- vllm/v1/structured_output/backend_xgrammar.py | 49 ++++----- vllm/v1/structured_output/request.py | 1 + 10 files changed, 233 insertions(+), 75 deletions(-) diff --git a/docs/source/features/reasoning_outputs.md b/docs/source/features/reasoning_outputs.md index 4759d0c26c35..3c2571298e4f 100644 --- a/docs/source/features/reasoning_outputs.md +++ b/docs/source/features/reasoning_outputs.md @@ -141,10 +141,10 @@ Remember to check whether the `reasoning_content` exists in the response before The reasoning content is also available in the structured output. The structured output engine like `xgrammar` will use the reasoning content to generate structured output. It is only supported in v0 engine now. ```bash -VLLM_USE_V1=0 vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 +vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1 ``` -Please note that the `VLLM_USE_V1` environment variable must be set to `0` to use the v0 engine. 
+The following is an example client: ```python from openai import OpenAI diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py index 5c116598ff3f..25bbcd901d6a 100644 --- a/tests/v1/entrypoints/llm/test_struct_output_generate.py +++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -5,17 +6,22 @@ import json import re from enum import Enum -from typing import Any +from typing import TYPE_CHECKING, Any import jsonschema import pytest from pydantic import BaseModel +from tests.reasoning.utils import run_reasoning_extraction from vllm.entrypoints.llm import LLM from vllm.outputs import RequestOutput from vllm.platforms import current_platform +from vllm.reasoning.abs_reasoning_parsers import ReasoningParserManager from vllm.sampling_params import GuidedDecodingParams, SamplingParams +if TYPE_CHECKING: + from vllm.config import TokenizerMode + NGRAM_SPEC_CONFIG = { "model": "[ngram]", "num_speculative_tokens": 5, @@ -444,7 +450,7 @@ def test_structured_output( prompt = """ You have access to the following function to retrieve the weather in a city: - + { "name": "get_weather", "parameters": { @@ -455,7 +461,7 @@ def test_structured_output( } } } - + If a you choose to call a function ONLY reply in the following format: <{start_tag}={function_name}>{parameters}{end_tag} where @@ -476,7 +482,7 @@ def test_structured_output( - Always add your sources when using search results to answer the user query You are a helpful assistant. - + Given the previous instructions, what is the weather in New York City? \ Make the response as short as possible. """ @@ -514,6 +520,88 @@ def test_structured_output( f"{generated_text!r}\nError: {str(e)}") +@pytest.mark.skip_global_cleanup +@pytest.mark.parametrize( + "model_name, guided_decoding_backend, tokenizer_mode, reasoning_parser, speculative_config", # noqa: E501 + [ + ("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "xgrammar", "auto", + "deepseek_r1", NGRAM_SPEC_CONFIG), + ("Qwen/Qwen3-1.7B", "xgrammar", "auto", "deepseek_r1", None), + ], +) +def test_structured_output_with_reasoning_matrices( + monkeypatch: pytest.MonkeyPatch, + guided_decoding_backend: str, + tokenizer_mode: TokenizerMode, + reasoning_parser: str, + model_name: str, + speculative_config: dict[str, Any] | None, +): + monkeypatch.setenv("VLLM_USE_V1", "1") + + if current_platform.is_tpu() and speculative_config: + pytest.skip("TPU does not support speculative decoding") + + # Use a single LLM instance for several scenarios to + # speed up the test suite. + llm = LLM( + model=model_name, + # Don't use eager execution on TPUs because we want to test for no + # recompilation at runtime + enforce_eager=bool(not current_platform.is_tpu()), + max_model_len=1024, + max_num_seqs=16, + guided_decoding_backend=guided_decoding_backend, + guided_decoding_disable_any_whitespace=True, + tokenizer_mode=tokenizer_mode, + reasoning_parser=reasoning_parser, + speculative_config=speculative_config, + ) + tokenizer = llm.get_tokenizer(None) + reasoner = ReasoningParserManager.get_reasoning_parser(reasoning_parser)( + tokenizer=tokenizer) + + reasoning_prompt = "Solve the following math problem step-by-step, then provide the final answer as JSON object with a single key 'result'. Make sure to correct your reasoning if there are any issue should it arise.\nProblem: What is 5 * 8 + 2?" 
# noqa: E501 + reasoning_schema = { + "type": "object", + "properties": { + "result": { + "type": "integer" + } + }, + "required": ["result"], + "additionalProperties": False + } + if "Qwen3" in model_name: + reasoning_prompt += "\n" + + sampling_params = SamplingParams( + temperature=0.1, + max_tokens=8192, + guided_decoding=GuidedDecodingParams(json=reasoning_schema), + ) + outputs = llm.generate( + [reasoning_prompt], + sampling_params=sampling_params, + use_tqdm=True, + ) + + assert outputs is not None + output = outputs[0] + assert output is not None and isinstance(output, RequestOutput) + prompt = output.prompt + generated_text = output.outputs[0].text + reasoning_content, content = run_reasoning_extraction( + reasoner, [generated_text]) + print( + f"Prompt: {prompt!r}\nReasoning: {reasoning_content!r}\nContent: {content!r}" + ) + + assert content is not None and reasoning_content is not None + output_json = json.loads(content) + jsonschema.validate(instance=output_json, schema=reasoning_schema) + + @pytest.mark.skip_global_cleanup @pytest.mark.parametrize("model_name, tokenizer_mode", PARAMS_MODELS_TOKENIZER_MODE) diff --git a/vllm/config.py b/vllm/config.py index c5d61405c839..09e89c1116f1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2332,7 +2332,7 @@ class SpeculativeConfig: `TypicalAcceptanceSampler`.""" speculative_token_tree: Optional[str] = None - """Specifies the tree structure for speculative token generation. + """Specifies the tree structure for speculative token generation. """ # required configuration params passed from engine target_model_config: ModelConfig = field(default=None, @@ -4024,7 +4024,7 @@ class VllmConfig: """LoRA configuration.""" speculative_config: Optional[SpeculativeConfig] = None """Speculative decoding configuration.""" - decoding_config: Optional[DecodingConfig] = None + decoding_config: DecodingConfig = field(default_factory=DecodingConfig) """Decoding configuration.""" observability_config: Optional[ObservabilityConfig] = None """Observability configuration.""" diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index 454167a0dc95..9dd5191da918 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import os from abc import abstractmethod from collections.abc import Sequence @@ -33,7 +35,7 @@ def vocab(self) -> dict[str, int]: return self.model_tokenizer.get_vocab() @abstractmethod - def is_reasoning_end(self, input_ids: list[int]) -> bool: + def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: """ Check if the reasoning content ends in the input_ids. @@ -106,7 +108,7 @@ class ReasoningParserManager: reasoning_parsers: dict[str, type] = {} @classmethod - def get_reasoning_parser(cls, name) -> type: + def get_reasoning_parser(cls, name: str | None) -> type[ReasoningParser]: """ Get reasoning parser by name which is registered by `register_module`. diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index f338e4ba1440..96313c288f7d 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -758,7 +758,8 @@ def update_from_output( # the outer lists can be of length > 1. 
new_logprobs = logprobs.slice(req_index, req_index + 1) - if new_token_ids and request.use_structured_output: + if new_token_ids and self.structured_output_manager.should_advance( + request): # NOTE: structured_output_request # should not be None if use_structured_output, we have # check above, so safe to ignore type warning @@ -767,11 +768,10 @@ def update_from_output( # Add newly generated spec token ids to the request. if spec_token_ids is not None: - if request.use_structured_output: + if self.structured_output_manager.should_advance(request): metadata = request.structured_output_request - assert metadata is not None and metadata.grammar is not None # Needs to happen after new_token_ids are accepted. - request.spec_token_ids = metadata.grammar.validate_tokens( + request.spec_token_ids = metadata.grammar.validate_tokens( # type: ignore[union-attr] spec_token_ids[req_index]) else: request.spec_token_ids = spec_token_ids[req_index] diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 3183edb7c94e..c701ab1d35a5 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -7,16 +7,23 @@ from vllm.config import VllmConfig from vllm.logger import init_logger +from vllm.reasoning import ReasoningParserManager +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_guidance import GuidanceBackend from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, StructuredOutputGrammar) +from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend if TYPE_CHECKING: import numpy as np import numpy.typing as npt import torch + from vllm.reasoning import ReasoningParser from vllm.v1.request import Request +else: + torch = LazyLoader("torch", globals(), "torch") logger = init_logger(__name__) @@ -26,9 +33,11 @@ class StructuredOutputManager: def __init__(self, vllm_config: VllmConfig): self.backend: Optional[StructuredOutputBackend] = None + self.reasoner: Optional[ReasoningParser] = None self.vllm_config = vllm_config self._grammar_bitmask: Optional[torch.Tensor] = None + self._full_mask = torch.tensor(-1, dtype=torch.int32) # The default max_workers if not specified is the number of CPUs * 5, # which is way too high since these tasks are CPU-bound, not I/O bound. @@ -36,24 +45,43 @@ def __init__(self, vllm_config: VllmConfig): # compilation, so we set it to half the number of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) + self.tokenizer = init_tokenizer_from_configs( + model_config=self.vllm_config.model_config, + scheduler_config=self.vllm_config.scheduler_config, + lora_config=self.vllm_config.lora_config, + ).get_lora_tokenizer(None) + reasoning_backend = vllm_config.decoding_config.reasoning_backend + if reasoning_backend: + reasoner_cls = ReasoningParserManager.get_reasoning_parser( + reasoning_backend) + self.reasoner = reasoner_cls(tokenizer=self.tokenizer) def grammar_init(self, request: Request) -> None: if request.structured_output_request is None: return + if TYPE_CHECKING: + assert request.sampling_params.guided_decoding is not None + # Initialize the backend the first time it is needed. # # NOTE: We only support a single backend. We do NOT support different # backends on a per-request basis in V1 (for now, anyway...). 
if self.backend is None:
             backend = request.sampling_params.guided_decoding.backend
+            vocab_size = self.vllm_config.model_config.get_vocab_size()
             if backend == "xgrammar":
-                from vllm.v1.structured_output.backend_xgrammar import (
-                    XgrammarBackend)
-
-                self.backend = XgrammarBackend(self.vllm_config)
+                self.backend = XgrammarBackend(
+                    self.vllm_config,
+                    tokenizer=self.tokenizer,
+                    vocab_size=vocab_size,
+                )
             elif backend == "guidance":
-                self.backend = GuidanceBackend(self.vllm_config)
+                self.backend = GuidanceBackend(
+                    self.vllm_config,
+                    tokenizer=self.tokenizer,
+                    vocab_size=vocab_size,
+                )
             else:
                 raise ValueError(
                     f"Unsupported structured output backend: {backend}")
@@ -87,14 +115,14 @@ def grammar_bitmask(
         if not structured_output_request_ids:
             return None
 
+        max_num_spec_tokens = 0
+        if self.vllm_config.speculative_config is not None:
+            max_num_spec_tokens = \
+                self.vllm_config.speculative_config.num_speculative_tokens
+
         if self._grammar_bitmask is None:
             assert self.backend is not None
             max_batch_size = self.vllm_config.scheduler_config.max_num_seqs
-            if self.vllm_config.speculative_config is not None:
-                max_num_spec_tokens = self.vllm_config.\
-                    speculative_config.num_speculative_tokens
-            else:
-                max_num_spec_tokens = 0
 
             # Allocate a bitmask for each token needing to be checked:
             # one for each speculative position, and one more for the
@@ -103,6 +131,7 @@ def grammar_bitmask(
                 self.backend.allocate_token_bitmask(
                     max_batch_size * (1 + max_num_spec_tokens))
 
+        bitmask_tensor = self._grammar_bitmask
         # Generate a batched bitmask for all structured output requests.
         # When speculative decoding is enabled, we need to include multiple
         # masks for each request, one for each possible bonus token position.
@@ -110,16 +139,30 @@ def grammar_bitmask(
         cumulative_index = 0
         ordered_seq = sorted(structured_output_request_ids.items(),
                              key=lambda x: x[1])
+
+        # Note that for thinking support, we will need to
+        # reset the relevant part of the bitmask for subsequent
+        # requests here.
+        bitmask_tensor[:(len(ordered_seq) * (1 + max_num_spec_tokens))].fill_(
+            self._full_mask)
+
         # NOTE: This outer loop can likely be parallelized to improve
         # performance of bitmask generation for large batches.
         for req_id, _ in ordered_seq:
            request = requests[req_id].structured_output_request
-            assert request is not None and request.grammar is not None
+            if TYPE_CHECKING:
+                assert request is not None
+                assert request.grammar is not None
+
+            apply_bitmask = (
+                request.reasoning_ended if self.reasoner is not None else True
+            )  # noqa: E501
+
             state_advancements = 0
             req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None]
             for i, token in enumerate(req_tokens):
-                if not request.grammar.is_terminated():
-                    request.grammar.fill_bitmask(self._grammar_bitmask,
+                if apply_bitmask and not request.grammar.is_terminated():
+                    request.grammar.fill_bitmask(bitmask_tensor,
                                                  cumulative_index)
                 if token is not None:
                     # In order to generate the correct bitmask for each
@@ -132,15 +175,41 @@ def grammar_bitmask(
             if state_advancements > 0:
                 request.grammar.rollback(state_advancements)
 
-        bitmask_tensor = self._grammar_bitmask
-        if cumulative_index < self._grammar_bitmask.shape[0]:
-            bitmask_tensor = self._grammar_bitmask[:cumulative_index]
+        if cumulative_index < bitmask_tensor.shape[0]:
+            bitmask_tensor = bitmask_tensor[:cumulative_index]
 
         # After finishing with the xgrammar operations, we convert to
         # np.ndarray, because that is much more efficient for serialization
        # and deserialization when sending this to the GPU workers.
return bitmask_tensor.numpy()
 
+    def should_advance(self, request: Request) -> bool:
+        if not request.use_structured_output:
+            return False
+
+        # To determine whether we can advance the FSM.
+        # Supports thinking usage where we skip the reasoning components.
+        if TYPE_CHECKING:
+            assert request.structured_output_request is not None
+            assert request.structured_output_request.grammar is not None
+        # by default, we should always advance
+        # for cases that don't use thinking mode.
+        if self.reasoner is not None:
+            structured_req = request.structured_output_request
+
+            if structured_req.reasoning_ended:
+                return True
+
+            # Check if reasoning ends in *this* step
+            if self.reasoner.is_reasoning_end(request.all_token_ids):
+                # Reasoning just ended, so we shouldn't advance until
+                # the next pass
+                structured_req.reasoning_ended = True
+
+            return False
+        else:
+            return True
+
     def clear_backend(self) -> None:
         if self.backend is not None:
             self.backend.destroy()
diff --git a/vllm/v1/structured_output/backend_guidance.py b/vllm/v1/structured_output/backend_guidance.py
index 0ab175e781e7..55c5f609095d 100644
--- a/vllm/v1/structured_output/backend_guidance.py
+++ b/vllm/v1/structured_output/backend_guidance.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import copy
 import json
 import os
@@ -8,10 +10,8 @@
 
 import torch
 
-from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.utils import LazyLoader
 from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
                                                      StructuredOutputGrammar,
@@ -54,25 +54,17 @@ def process_for_additional_properties(
     return guide_json_obj
 
 
+@dataclass
 class GuidanceBackend(StructuredOutputBackend):
 
-    def __init__(self, vllm_config: VllmConfig):
-        self.vllm_config = vllm_config
-        tokenizer_group = init_tokenizer_from_configs(
-            model_config=vllm_config.model_config,
-            scheduler_config=vllm_config.scheduler_config,
-            lora_config=vllm_config.lora_config)  # type: ignore[arg-type]
-        self.vllm_config = vllm_config
-        self.vocab_size = vllm_config.model_config.get_vocab_size()
-
+    def __post_init__(self):
         self.disable_any_whitespace = \
-            vllm_config.decoding_config.disable_any_whitespace
+            self.vllm_config.decoding_config.disable_any_whitespace
         self.disable_additional_properties = \
-            vllm_config.decoding_config.disable_additional_properties
+            self.vllm_config.decoding_config.disable_additional_properties
 
-        tokenizer = tokenizer_group.get_lora_tokenizer(None)
         self.ll_tokenizer = llguidance_hf.from_tokenizer(
-            tokenizer, self.vocab_size)
+            self.tokenizer, self.vocab_size)
 
     def compile_grammar(self, request_type: StructuredOutputOptions,
                         grammar_spec: str) -> StructuredOutputGrammar:
diff --git a/vllm/v1/structured_output/backend_types.py b/vllm/v1/structured_output/backend_types.py
index 33ca9f8cf484..09f6cdf73337 100644
--- a/vllm/v1/structured_output/backend_types.py
+++ b/vllm/v1/structured_output/backend_types.py
@@ -1,9 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from __future__ import annotations
+
 import enum
 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import torch
 
-import torch
+    from vllm.config import VllmConfig
+    from vllm.transformers_utils.tokenizer import AnyTokenizer
 
 
 class StructuredOutputOptions(enum.Enum):
@@ -85,9 +93,14 @@ def reset(self):
         """
 
 
+@dataclass
 class 
StructuredOutputBackend(ABC): """Engine-level backend for structured output requests.""" + vllm_config: VllmConfig + tokenizer: AnyTokenizer + vocab_size: int + @abstractmethod def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: @@ -104,7 +117,7 @@ def compile_grammar(self, request_type: StructuredOutputOptions, """ @abstractmethod - def allocate_token_bitmask(self, max_num_seqs: int): + def allocate_token_bitmask(self, max_num_seqs: int) -> torch.Tensor: """ Allocates a token bitmask for the specified maximum number of sequences. diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index 2ce2be337ecb..f2570221da25 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import json from dataclasses import dataclass, field from typing import TYPE_CHECKING, Any @@ -7,10 +9,8 @@ import torch import vllm.envs -from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizers.mistral import MistralTokenizer from vllm.utils import LazyLoader from vllm.v1.structured_output.backend_types import (StructuredOutputBackend, @@ -28,61 +28,49 @@ logger = init_logger(__name__) +@dataclass class XgrammarBackend(StructuredOutputBackend): - def __init__(self, vllm_config: VllmConfig): - self.vllm_config = vllm_config - tokenizer_group = init_tokenizer_from_configs( - model_config=vllm_config.model_config, - scheduler_config=vllm_config.scheduler_config, - lora_config=vllm_config.lora_config) # type: ignore[arg-type] - + def __post_init__(self): self.disable_any_whitespace = \ - vllm_config.decoding_config.disable_any_whitespace + self.vllm_config.decoding_config.disable_any_whitespace - self.num_speculative_tokens = 0 - if self.vllm_config.speculative_config is not None: - self.num_speculative_tokens = \ - self.vllm_config.speculative_config.num_speculative_tokens - - tokenizer = tokenizer_group.get_lora_tokenizer(None) - self.vocab_size = vllm_config.model_config.get_vocab_size() - if isinstance(tokenizer, MistralTokenizer): + if isinstance(self.tokenizer, MistralTokenizer): # NOTE: ideally, xgrammar should handle this accordingly. # refer to https://github.com/mlc-ai/xgrammar/blob/d77c0a0173ef14779c918e3be7966ba852f7910f/python/xgrammar/tokenizer_info.py#L98 try: - if tokenizer.is_tekken: - encoded_vocab = tokenizer._vocab + if self.tokenizer.is_tekken: + encoded_vocab = self.tokenizer._vocab else: encoded_vocab = [ token for token, _ in sorted( - tokenizer.get_vocab().items(), + self.tokenizer.get_vocab().items(), key=lambda x: x[1], ) ] stop_token_ids = None - if hasattr( - tokenizer, + if (hasattr( + self.tokenizer, "eos_token_id", - ) and tokenizer.eos_token_id is not None: - stop_token_ids = [tokenizer.eos_token_id] + ) and self.tokenizer.eos_token_id is not None): + stop_token_ids = [self.tokenizer.eos_token_id] except AttributeError as e: raise ValueError( f"Cannot get the vocabulary of the tokenizer " - f"{type(tokenizer)}. The tokenizer should have a " + f"{type(self.tokenizer)}. 
The tokenizer should have a " "get_vocab method.") from e tokenizer_info = xgr.TokenizerInfo( # type: ignore encoded_vocab=encoded_vocab, # NOTE: https://github.com/mlc-ai/xgrammar/blob/5e141f6ff1ca02bc31f9e512e68b61f2a8ae88e5/tests/python/test_tokenizer_info.py#L43 # noqa: E501 vocab_type=xgr.VocabType.RAW - if tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK, + if self.tokenizer.is_tekken else xgr.VocabType.BYTE_FALLBACK, vocab_size=self.vocab_size, stop_token_ids=stop_token_ids, add_prefix_space=True, ) else: tokenizer_info = xgr.TokenizerInfo.from_huggingface( - tokenizer, + self.tokenizer, vocab_size=self.vocab_size, ) self.compiler = xgr.GrammarCompiler( @@ -92,6 +80,11 @@ def __init__(self, vllm_config: VllmConfig): cache_limit_bytes=vllm.envs.VLLM_XGRAMMAR_CACHE_MB * 1024 * 1024, ) + self.num_speculative_tokens = 0 + if self.vllm_config.speculative_config is not None: + self.num_speculative_tokens = \ + self.vllm_config.speculative_config.num_speculative_tokens + def compile_grammar(self, request_type: StructuredOutputOptions, grammar_spec: str) -> StructuredOutputGrammar: if request_type == StructuredOutputOptions.JSON: diff --git a/vllm/v1/structured_output/request.py b/vllm/v1/structured_output/request.py index 6ef472eb896c..c16320b9e74c 100644 --- a/vllm/v1/structured_output/request.py +++ b/vllm/v1/structured_output/request.py @@ -20,6 +20,7 @@ class StructuredOutputRequest: sampling_params: SamplingParams _grammar: Optional[Union[Future[StructuredOutputGrammar], StructuredOutputGrammar]] = None + reasoning_ended: bool = False def _check_grammar_completion(self) -> bool: # NOTE: We have to lazy import to gate circular imports From 17cc4c9989846d51ff4532bb11e7b481feca2ca7 Mon Sep 17 00:00:00 2001 From: mgoin Date: Wed, 14 May 2025 22:46:56 +0000 Subject: [PATCH 10/58] Fix mypy Signed-off-by: mgoin --- .../kv_transfer/kv_connector/v1/nixl_connector.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index 5b4112eab286..654a89c9c4d3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -795,12 +795,13 @@ def _get_block_descs_ids(self, if self.num_layers < self.num_regions: # If we have more regions than layers, we assume that # the regions are organized as [K0, V0, K1, V1, ...] 
+ # and we select K_i and V_i assert 2 * self.num_layers == self.num_regions - region_ids = [2 * layer_idx, 2 * layer_idx + 1] + region_ids = range(2 * layer_idx, 2 * layer_idx + 2) else: - # Otherwise, we assume we have MLA + # Otherwise, we assume we have MLA and select i-th layer assert self.num_layers == self.num_regions - region_ids = [layer_idx] + region_ids = range(layer_idx, layer_idx + 1) num_blocks = self.dst_num_blocks[engine_id] From 7974736740aee195e454eb989cd9112577bc8652 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 14 May 2025 16:24:59 -0700 Subject: [PATCH 11/58] Add support for loading torchao models with `AOPerModuleConfig` (#17826) Signed-off-by: Jerry Zhang --- tests/quantization/test_torchao.py | 18 ++++++++++-- .../layers/quantization/torchao.py | 29 ++++++++++++++----- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/tests/quantization/test_torchao.py b/tests/quantization/test_torchao.py index 1a20228765e8..6571fc9e471b 100644 --- a/tests/quantization/test_torchao.py +++ b/tests/quantization/test_torchao.py @@ -31,9 +31,6 @@ def test_pre_quantized_model(vllm_runner): ]) def test_opt_125m_int4wo_model_loading_with_params(vllm_runner, pt_load_map_location): - """ - Test loading roberta-base model with no lm_head. - """ torch._dynamo.reset() model_name = "jerryzh168/opt-125m-int4wo" with vllm_runner(model_name=model_name, @@ -47,5 +44,20 @@ def test_opt_125m_int4wo_model_loading_with_params(vllm_runner, print(output) +@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available") +def test_opt_125m_int4wo_model_per_module_quant(vllm_runner): + torch._dynamo.reset() + model_name = "jerryzh168/opt-125m-int4wo-per-module" + with vllm_runner(model_name=model_name, + quantization="torchao", + dtype="bfloat16", + pt_load_map_location="cuda:0") as llm: + output = llm.generate_greedy(["The capital of France is"], + max_tokens=32) + + assert output + print(output) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/model_executor/layers/quantization/torchao.py b/vllm/model_executor/layers/quantization/torchao.py index 9b60775df96f..7f9f3e643bfa 100644 --- a/vllm/model_executor/layers/quantization/torchao.py +++ b/vllm/model_executor/layers/quantization/torchao.py @@ -5,10 +5,11 @@ import torch.nn.functional as F from torch.nn.parameter import Parameter -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization import QuantizationMethods from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) + QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.utils import set_weight_attrs @@ -55,10 +56,24 @@ def from_config(cls, config: dict[str, Any]) -> "TorchAOConfig": return cls(ao_config) def get_quant_method(self, layer: torch.nn.Module, - prefix: str) -> Optional["TorchAOLinearMethod"]: - if isinstance(layer, LinearBase): - return TorchAOLinearMethod(self) - return None + prefix: str) -> Optional["QuantizeMethodBase"]: + if not isinstance(layer, LinearBase): + return None + + from torchao.quantization import AOPerModuleConfig + + module_fqn = prefix + if isinstance(self.torchao_config, AOPerModuleConfig): + module_fqn_to_config = self.torchao_config.module_fqn_to_config + c = module_fqn_to_config.get( + module_fqn) or module_fqn_to_config.get("_default", None) + if c is not None: + current_torchao_config = 
TorchAOConfig(c) + return TorchAOLinearMethod(current_torchao_config) + else: + return UnquantizedLinearMethod() + + return TorchAOLinearMethod(self) def get_scaled_act_names(self) -> list[str]: return [] @@ -75,7 +90,7 @@ def torchao_quantize_param_data(param: torch.Tensor, """ from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ - assert isinstance(torchao_config, AOBaseConfig) + assert isinstance(torchao_config, AOBaseConfig), f"{torchao_config}" dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False) dummy_linear.weight = param quantize_(dummy_linear, torchao_config) From 78aa341d124e4e2162defdabde8e8b0a97ffb79d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 14 May 2025 19:27:48 -0400 Subject: [PATCH 12/58] [CI] Fix race condition in test_kv_cache_events test (#18169) Signed-off-by: Russell Bryant --- tests/v1/engine/test_engine_core_client.py | 1 - vllm/distributed/kv_events.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 671d74b83b85..71ebd0a36e46 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -292,7 +292,6 @@ def test_kv_cache_events( log_stats=False, ) endpoint = publisher_config.endpoint.replace("*", "127.0.0.1") - time.sleep(0.1) subscriber = MockSubscriber(endpoint, topic=publisher_config.topic, decode_type=KVEventBatch) diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 1141a8e53c3b..29c6a70c4d26 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -130,6 +130,7 @@ def __init__( self._endpoint = endpoint self._replay_endpoint = replay_endpoint self._hwm = hwm + self._socket_setup() # Payload self._seq_gen = count() @@ -207,7 +208,6 @@ def _socket_setup(self) -> None: def _publisher_thread(self) -> None: """Background thread that processes the event queue.""" self._pack = msgspec.msgpack.Encoder() - self._socket_setup() assert self._pub is not None # narrows type for mypy From 2142035b51795d69d7434ce74e7aad746a2bab7a Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 14 May 2025 19:28:02 -0400 Subject: [PATCH 13/58] [V1] Support multiple kv connectors (#17564) Signed-off-by: mgoin Signed-off-by: Nick Hill Co-authored-by: Nick Hill --- .../kv_connector/unit/test_multi_connector.py | 241 ++++++++++++++++++ .../kv_transfer/kv_connector/factory.py | 5 + .../kv_transfer/kv_connector/v1/base.py | 2 - .../kv_connector/v1/multi_connector.py | 178 +++++++++++++ 4 files changed, 424 insertions(+), 2 deletions(-) create mode 100644 tests/v1/kv_connector/unit/test_multi_connector.py create mode 100644 vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py new file mode 100644 index 000000000000..64da0d79bf33 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -0,0 +1,241 @@ +# SPDX-License-Identifier: Apache-2.0 +import filecmp +import shutil +import tempfile +from collections import defaultdict +from pathlib import Path + +from vllm import LLM, SamplingParams +from vllm.config import KVTransferConfig, VllmConfig +from vllm.distributed.kv_transfer.kv_connector.factory import ( + KVConnectorFactory) +from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa + SharedStorageConnector) + +MODEL_NAME = 
"meta-llama/Llama-3.2-1B-Instruct" + +PROMPT_CONTEXT = "Hi " * 100 +PROMPTS = [ + PROMPT_CONTEXT + "Hello, my name is", + PROMPT_CONTEXT + "The capital of France is", +] + +SAMPLING_PARAMS = SamplingParams(temperature=0, max_tokens=20) + + +class TestSharedStorageConnector(SharedStorageConnector): + + def __init__(self, config: VllmConfig, role): + self.name = config.kv_transfer_config.kv_connector_extra_config["name"] + self._connector = SharedStorageConnector(config, role) + self.call_record: dict[str, int] = defaultdict(int) + # Use a unique temp file per connector + self._event_file = tempfile.gettempdir( + ) + f"/connector_{self.name}_events.log" + # Start with an empty file + with open(self._event_file, "w") as _: + pass + + def __getattribute__(self, name): + if name in ("_connector", "call_record", "name", "_event_file", + "__class__", "__dict__", "__getattribute__", + "__init__"): # avoid recursion + return object.__getattribute__(self, name) + if not hasattr(self._connector, name): + return object.__getattribute__(self, name) + attr = getattr(self._connector, name) + + # Intercept calls to the connector interface and write an event + # for each one to a file, which can be read back in the main test proc. + if callable(attr): + + def wrapper(*args, **kwargs): + self.call_record[name] += 1 + # Log the event as a line to the file + try: + with open(self._event_file, "a") as f: + f.write(name + "\n") + except Exception as e: + print(f"[ERROR] Could not log event {name} " + f"for {self.name}: {e}") + return attr(*args, **kwargs) + + return wrapper + return attr + + +KVConnectorFactory.register_connector("TestSharedStorageConnector", + TestSharedStorageConnector.__module__, + TestSharedStorageConnector.__name__) + + +# Helper function to compare directories recursively +def _compare_directories(dir1: Path, dir2: Path) -> bool: + """Compares two directories recursively for identical content.""" + dcmp = filecmp.dircmp(dir1, dir2) + if dcmp.left_only or dcmp.right_only or dcmp.diff_files: + print(f"Differences found between {dir1} and {dir2}:") + print(f" Left only: {dcmp.left_only}") + print(f" Right only: {dcmp.right_only}") + print(f" Different files: {dcmp.diff_files}") + return False + for sub_dir in dcmp.common_dirs: + if not _compare_directories(dir1 / sub_dir, dir2 / sub_dir): + return False + return True + + +def test_multi_shared_storage_connector_consistency(): + """ + Tests that MultiConnector with two SharedStorageConnectors saves + identical KV cache data to separate storage locations. 
+ """ + storage_1_path = Path("storage_1/") + storage_2_path = Path("storage_2/") + shutil.rmtree(storage_1_path, ignore_errors=True) + shutil.rmtree(storage_2_path, ignore_errors=True) + storage_1_path.mkdir() + storage_2_path.mkdir() + + # Configure MultiConnector with two SharedStorageConnectors + kv_transfer_config = KVTransferConfig( + kv_connector="MultiConnector", + kv_role="kv_both", + kv_connector_extra_config={ + "connectors": [{ + "kv_connector": "TestSharedStorageConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "shared_storage_path": str(storage_1_path), + "name": "storage1", + } + }, { + "kv_connector": "TestSharedStorageConnector", + "kv_role": "kv_both", + "kv_connector_extra_config": { + "shared_storage_path": str(storage_2_path), + "name": "storage2", + } + }] + }, + ) + + llm = LLM( + model=MODEL_NAME, + enforce_eager=True, + gpu_memory_utilization=0.5, + kv_transfer_config=kv_transfer_config, + ) + # Run generation - this should trigger saving KV cache + _ = llm.generate(PROMPTS, SAMPLING_PARAMS) + + # --- Verification --- + + # Check that both storage directories were populated + local_subdirs = list(storage_1_path.iterdir()) + external_subdirs = list(storage_2_path.iterdir()) + + assert len( + local_subdirs + ) > 0, f"Local storage path {storage_1_path} is empty after generation." + assert len(external_subdirs) > 0, ( + f"External storage path {storage_2_path} is empty after generation.") + assert len(local_subdirs) == len(external_subdirs), ( + f"Mismatch in number of cache entries: " + f"Local={len(local_subdirs)}, External={len(external_subdirs)}") + + # The subdirectories should correspond to the prompt hashes + # Since prompts are the same, the hash directories should be the same name + local_subdir_names = sorted([d.name for d in local_subdirs]) + external_subdir_names = sorted([d.name for d in external_subdirs]) + assert local_subdir_names == external_subdir_names, ( + "Cache directory names do not match between local and external storage" + ) + + # Compare the contents of each corresponding cache directory + for subdir_name in local_subdir_names: + print(f"Comparing contents of cache directory: {subdir_name}") + assert _compare_directories(storage_1_path / subdir_name, + storage_2_path / subdir_name), \ + (f"Contents differ for cache directory '{subdir_name}' between " + f"{storage_1_path} and {storage_2_path}") + + events = get_connector_events() + # get_num_new_matched_tokens will be called on each connector in turn. + # neither of them have hits so update_state_after_alloc won't be called. + assert events["storage1"][:3] == [ + 'get_num_new_matched_tokens', 'build_connector_meta', + 'bind_connector_metadata' + ] + assert events["storage2"][:3] == [ + 'get_num_new_matched_tokens', 'build_connector_meta', + 'bind_connector_metadata' + ] + + # Reset prefix cache or else we'll just get the tokens back from there. + llm.reset_prefix_cache() + + # Run generation again - this should trigger loading from the first + # connector. + _ = llm.generate(PROMPTS, SAMPLING_PARAMS) + + events = get_connector_events() + # get_num_new_matched_tokens will return new tokens from the first + # connector so update_state_after_alloc will be called once blocks + # are allocated for the first connector. + # get_num_new_matched_tokens *won't* be called on the second connector + # in this case. 
+    assert events["storage1"][:4] == [
+        'get_num_new_matched_tokens', 'update_state_after_alloc',
+        'build_connector_meta', 'bind_connector_metadata'
+    ]
+    assert events["storage2"][:2] == [
+        'build_connector_meta', 'bind_connector_metadata'
+    ]
+
+    # Delete storage1 connector state
+    shutil.rmtree(storage_1_path)
+
+    # Reset prefix cache or else we'll just get the tokens back from there.
+    llm.reset_prefix_cache()
+
+    # Run generation again - this should trigger loading from the second
+    # connector.
+    _ = llm.generate(PROMPTS, SAMPLING_PARAMS)
+
+    events = get_connector_events()
+    # get_num_new_matched_tokens will be called for the first connector but it
+    # won't have a hit so update_state_after_alloc won't be called.
+    # get_num_new_matched_tokens will also be called on the second connector,
+    # but it should have a hit so update_state_after_alloc will be called.
+    assert events["storage1"][:3] == [
+        'get_num_new_matched_tokens', 'build_connector_meta',
+        'bind_connector_metadata'
+    ]
+    assert events["storage2"][:4] == [
+        'get_num_new_matched_tokens', 'update_state_after_alloc',
+        'build_connector_meta', 'bind_connector_metadata'
+    ]
+
+    # Clean up
+    shutil.rmtree(storage_1_path)
+    shutil.rmtree(storage_2_path)
+
+
+def get_connector_events() -> dict[str, list[str]]:
+    # Read in connector events and reset the files.
+    import glob
+    event_files = glob.glob(tempfile.gettempdir() + "/connector_*_events.log")
+    connector_events = {}
+    for fname in event_files:
+        name = fname.split("connector_")[1].split("_events.log")[0]
+        try:
+            with open(fname, "r+") as f:
+                connector_events[name] = [
+                    line.strip() for line in f if line.strip()
+                ]
+                f.truncate(0)
+        except Exception as e:
+            print(f"[ERROR] Could not read connector events for {name}: {e}")
+
+    return connector_events
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index 6766d5a24542..f998f5dd7b15 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
+++ b/vllm/distributed/kv_transfer/kv_connector/factory.py
@@ -110,3 +110,8 @@ def create_connector_v1(
     "NixlConnector",
     "vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector",
     "NixlConnector")
+
+KVConnectorFactory.register_connector(
+    "MultiConnector",
+    "vllm.distributed.kv_transfer.kv_connector.v1.multi_connector",
+    "MultiConnector")
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index 03c99f20e775..9fdb5340f0e2 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -22,7 +22,6 @@
 
 import enum
 from abc import ABC, abstractmethod
-from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional
 
 import torch
@@ -48,7 +47,6 @@ class KVConnectorRole(enum.Enum):
     WORKER = 1
 
 
-@dataclass
 class KVConnectorMetadata:
     """
     Abstract Metadata used to communicate between the
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
new file mode 100644
index 000000000000..cc4a7fbadf5c
--- /dev/null
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+import copy
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+
+from vllm.config import KVTransferConfig, VllmConfig
+from vllm.distributed.kv_transfer.kv_connector.factory import (
+    KVConnectorFactory)
+from 
vllm.distributed.kv_transfer.kv_connector.v1.base import ( + KVConnectorBase_V1, KVConnectorMetadata, KVConnectorRole) +from vllm.logger import init_logger +from vllm.v1.core.sched.output import SchedulerOutput + +if TYPE_CHECKING: + from vllm.attention.backends.abstract import AttentionMetadata + from vllm.forward_context import ForwardContext + from vllm.v1.core.kv_cache_manager import KVCacheBlocks + from vllm.v1.request import Request + +logger = init_logger(__name__) + + +class MultiKVConnectorMetadata(tuple[KVConnectorMetadata, ...], + KVConnectorMetadata): + pass + + +class MultiConnector(KVConnectorBase_V1): + """ + A wrapper for using multiple KVConnectors at the same time. + + The current logic is: + - Load KV from the first connector that advertises available tokens from + get_num_new_matched_tokens(), based on the order in the config. + - Save to all connectors. + """ + + def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): + super().__init__(vllm_config=vllm_config, role=role) + self._connectors = [] + ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get( + "connectors") + assert ktcs is not None + for ktc in ktcs: + temp_config = copy.copy(vllm_config) + temp_config.kv_transfer_config = KVTransferConfig(**ktc) + self._connectors.append( + KVConnectorFactory.create_connector_v1(temp_config, role)) + + # A mapping from request id to the connector that is assigned to it. + self._requests_to_connector: dict[str, KVConnectorBase_V1] = {} + + # Keeps track of *additional* remaining async saves (beyond 1) to be + # finished per request. Not needed for async loads since we only allow + # a single connector to load. + self._extra_async_saves: dict[str, int] = {} + + def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]): + for c in self._connectors: + c.register_kv_caches(kv_caches) + + # We must override the base class method here because we need to bind + # the metadata to each connector in the order of the connectors in the + # MultiKVConnectorMetadata. + def bind_connector_metadata( + self, connector_metadata: KVConnectorMetadata) -> None: + assert isinstance(connector_metadata, MultiKVConnectorMetadata) + for c, cm in zip(self._connectors, connector_metadata): + c.bind_connector_metadata(cm) + + def clear_connector_metadata(self) -> None: + for c in self._connectors: + c.clear_connector_metadata() + + # ============================== + # Worker-side methods + # ============================== + def start_load_kv(self, forward_context: "ForwardContext", + **kwargs) -> None: + for c in self._connectors: + c.start_load_kv(forward_context, **kwargs) + + def wait_for_layer_load(self, layer_name: str) -> None: + for c in self._connectors: + c.wait_for_layer_load(layer_name) + + def save_kv_layer(self, layer_name: str, kv_layer: torch.Tensor, + attn_metadata: "AttentionMetadata", **kwargs) -> None: + for c in self._connectors: + c.save_kv_layer(layer_name, kv_layer, attn_metadata, **kwargs) + + def wait_for_save(self): + for c in self._connectors: + c.wait_for_save() + + def get_finished( + self, finished_req_ids: set[str] + ) -> tuple[Optional[set[str]], Optional[set[str]]]: + finished_recving: set[str] = set() + finished_sending: set[str] = set() + for c in self._connectors: + recving, sending = c.get_finished(finished_req_ids) + if not recving and not sending: + continue + # Aggregate finished recving request ids. 
+ finished_recving.update(recving or ()) + # Aggregate finished sending request ids - only include + # once we've drained the "extra" count (for cases where + # more than one connector is async-saving the same request). + for req_id in sending or (): + extra_pending = self._extra_async_saves.get(req_id) + if extra_pending is None: + finished_sending.add(req_id) + continue + assert extra_pending > 0 + if extra_pending == 1: + del self._extra_async_saves[req_id] + else: + self._extra_async_saves[req_id] = extra_pending - 1 + + return finished_recving or None, finished_sending or None + + # ============================== + # Scheduler-side methods + # ============================== + def get_num_new_matched_tokens( + self, + request: "Request", + num_computed_tokens: int, + ) -> tuple[int, bool]: + for c in self._connectors: + toks, load_async = c.get_num_new_matched_tokens( + request, num_computed_tokens) + # The first connector that has new matched tokens will be assigned + # to this request. + if toks > 0: + self._requests_to_connector[request.request_id] = c + return toks, load_async + return 0, False + + def update_state_after_alloc(self, request: "Request", + blocks: "KVCacheBlocks", + num_external_tokens: int): + # If the request is not assigned to any connector, we do nothing. + if request.request_id not in self._requests_to_connector: + return + # We assume that the request is assigned to only one connector. + c = self._requests_to_connector.pop(request.request_id) + c.update_state_after_alloc(request, blocks, num_external_tokens) + + def build_connector_meta( + self, + scheduler_output: SchedulerOutput) -> MultiKVConnectorMetadata: + return MultiKVConnectorMetadata( + c.build_connector_meta(scheduler_output) for c in self._connectors) + + def request_finished( + self, + request: "Request", + blocks: "KVCacheBlocks", + ) -> tuple[bool, Optional[dict[str, Any]]]: + async_saves = 0 + kv_txfer_params = None + for c in self._connectors: + async_save, txfer_params = c.request_finished(request, blocks) + if async_save: + async_saves += 1 + if txfer_params is not None: + if kv_txfer_params is not None: + #TODO we can probably change this to merge the dicts here, + # checking for key clashes. 
+ raise RuntimeError( + "Only one connector can produce KV transfer params") + kv_txfer_params = txfer_params + if async_saves > 1: + self._extra_async_saves[request.request_id] = async_saves - 1 + return async_saves > 0, kv_txfer_params From 09f106a91e1a90f6d703571159a97db9783bb7f7 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 14 May 2025 16:35:56 -0700 Subject: [PATCH 14/58] Upload vllm index for the rc builds (#18173) --- .buildkite/scripts/upload-wheels.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 75e3ef264095..037897e53dbe 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -75,3 +75,4 @@ else fi aws s3 cp "$wheel" "s3://vllm-wheels/$version/" +aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html" From f25e0d1125f873201ae880b50df46a9e3d29f3ba Mon Sep 17 00:00:00 2001 From: David Xia Date: Wed, 14 May 2025 20:04:35 -0400 Subject: [PATCH 15/58] [Bugfix]: make most of `test_openai_schema.py` pass (#17664) --- vllm/entrypoints/openai/api_server.py | 271 +++++++++++++++--- vllm/entrypoints/openai/serving_chat.py | 2 +- .../openai/serving_tokenization.py | 2 +- 3 files changed, 240 insertions(+), 35 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 5b3df0faccf6..0ab6fcdca1a4 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -17,8 +17,10 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus +from json import JSONDecodeError from typing import Annotated, Optional, Union +import prometheus_client import uvloop from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request from fastapi.exceptions import RequestValidationError @@ -305,15 +307,18 @@ async def validate_json_request(raw_request: Request): content_type = raw_request.headers.get("content-type", "").lower() media_type = content_type.split(";", maxsplit=1)[0] if media_type != "application/json": - raise HTTPException( - status_code=HTTPStatus.UNSUPPORTED_MEDIA_TYPE, - detail="Unsupported Media Type: Only 'application/json' is allowed" - ) + raise RequestValidationError(errors=[ + "Unsupported Media Type: Only 'application/json' is allowed" + ]) router = APIRouter() +class PrometheusResponse(Response): + media_type = prometheus_client.CONTENT_TYPE_LATEST + + def mount_metrics(app: FastAPI): # Lazy import for prometheus multiprocessing. # We need to set PROMETHEUS_MULTIPROC_DIR environment variable @@ -332,6 +337,10 @@ def mount_metrics(app: FastAPI): registry = CollectorRegistry() multiprocess.MultiProcessCollector(registry) + # `response_class=PrometheusResponse` is needed to return an HTTP response + # with header "Content-Type: text/plain; version=0.0.4; charset=utf-8" + # instead of the default "application/json" which is incorrect. 
+ # See https://github.com/trallnag/prometheus-fastapi-instrumentator/issues/163#issue-1296092364 Instrumentator( excluded_handlers=[ "/metrics", @@ -342,7 +351,7 @@ def mount_metrics(app: FastAPI): "/server_info", ], registry=registry, - ).add().instrument(app).expose(app) + ).add().instrument(app).expose(app, response_class=PrometheusResponse) # Add prometheus asgi middleware to route /metrics requests metrics_route = Mount("/metrics", make_asgi_app(registry=registry)) @@ -401,11 +410,11 @@ def engine_client(request: Request) -> EngineClient: return request.app.state.engine_client -@router.get("/health") -async def health(raw_request: Request) -> JSONResponse: +@router.get("/health", response_class=Response) +async def health(raw_request: Request) -> Response: """Health check.""" await engine_client(raw_request).check_health() - return JSONResponse(content={}, status_code=200) + return Response(status_code=200) @router.get("/load") @@ -427,18 +436,42 @@ async def get_server_load_metrics(request: Request): content={'server_load': request.app.state.server_load_metrics}) -@router.api_route("/ping", methods=["GET", "POST"]) -async def ping(raw_request: Request) -> JSONResponse: +@router.get("/ping", response_class=Response) +@router.post("/ping", response_class=Response) +async def ping(raw_request: Request) -> Response: """Ping check. Endpoint required for SageMaker""" return await health(raw_request) -@router.post("/tokenize", dependencies=[Depends(validate_json_request)]) +@router.post("/tokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_IMPLEMENTED.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def tokenize(request: TokenizeRequest, raw_request: Request): handler = tokenization(raw_request) - generator = await handler.create_tokenize(request, raw_request) + try: + generator = await handler.create_tokenize(request, raw_request) + except NotImplementedError as e: + raise HTTPException(status_code=HTTPStatus.NOT_IMPLEMENTED.value, + detail=str(e)) from e + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -448,12 +481,31 @@ async def tokenize(request: TokenizeRequest, raw_request: Request): assert_never(generator) -@router.post("/detokenize", dependencies=[Depends(validate_json_request)]) +@router.post("/detokenize", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def detokenize(request: DetokenizeRequest, raw_request: Request): handler = tokenization(raw_request) - generator = await handler.create_detokenize(request, raw_request) + try: + generator = await handler.create_detokenize(request, raw_request) + except OverflowError as e: + raise RequestValidationError(errors=[str(e)]) from e + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), 
status_code=generator.code) @@ -478,7 +530,23 @@ async def show_version(): @router.post("/v1/chat/completions", - dependencies=[Depends(validate_json_request)]) + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + } + }) @with_cancellation @load_aware_call async def create_chat_completion(request: ChatCompletionRequest, @@ -500,7 +568,24 @@ async def create_chat_completion(request: ChatCompletionRequest, return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/completions", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/completions", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.NOT_FOUND.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_completion(request: CompletionRequest, raw_request: Request): @@ -509,7 +594,15 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return base(raw_request).create_error_response( message="The model does not support Completions API") - generator = await handler.create_completion(request, raw_request) + try: + generator = await handler.create_completion(request, raw_request) + except OverflowError as e: + raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, + detail=str(e)) from e + except Exception as e: + raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR.value, + detail=str(e)) from e + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -519,7 +612,16 @@ async def create_completion(request: CompletionRequest, raw_request: Request): return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/v1/embeddings", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/embeddings", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_embedding(request: EmbeddingRequest, raw_request: Request): @@ -566,7 +668,16 @@ async def create_embedding(request: EmbeddingRequest, raw_request: Request): assert_never(generator) -@router.post("/pooling", dependencies=[Depends(validate_json_request)]) +@router.post("/pooling", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_pooling(request: PoolingRequest, raw_request: Request): @@ -606,7 +717,16 @@ async def create_classify(request: ClassificationRequest, assert_never(generator) -@router.post("/score", dependencies=[Depends(validate_json_request)]) +@router.post("/score", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: 
{ + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_score(request: ScoreRequest, raw_request: Request): @@ -625,7 +745,16 @@ async def create_score(request: ScoreRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/score", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/score", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def create_score_v1(request: ScoreRequest, raw_request: Request): @@ -636,12 +765,28 @@ async def create_score_v1(request: ScoreRequest, raw_request: Request): return await create_score(request, raw_request) -@router.post("/v1/audio/transcriptions") +@router.post("/v1/audio/transcriptions", + responses={ + HTTPStatus.OK.value: { + "content": { + "text/event-stream": {} + } + }, + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.UNPROCESSABLE_ENTITY.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call -async def create_transcriptions(request: Annotated[TranscriptionRequest, - Form()], - raw_request: Request): +async def create_transcriptions(raw_request: Request, + request: Annotated[TranscriptionRequest, + Form()]): handler = transcription(raw_request) if handler is None: return base(raw_request).create_error_response( @@ -661,7 +806,16 @@ async def create_transcriptions(request: Annotated[TranscriptionRequest, return StreamingResponse(content=generator, media_type="text/event-stream") -@router.post("/rerank", dependencies=[Depends(validate_json_request)]) +@router.post("/rerank", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation @load_aware_call async def do_rerank(request: RerankRequest, raw_request: Request): @@ -679,7 +833,16 @@ async def do_rerank(request: RerankRequest, raw_request: Request): assert_never(generator) -@router.post("/v1/rerank", dependencies=[Depends(validate_json_request)]) +@router.post("/v1/rerank", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def do_rerank_v1(request: RerankRequest, raw_request: Request): logger.warning_once( @@ -690,7 +853,16 @@ async def do_rerank_v1(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) -@router.post("/v2/rerank", dependencies=[Depends(validate_json_request)]) +@router.post("/v2/rerank", + dependencies=[Depends(validate_json_request)], + responses={ + HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) @with_cancellation async def do_rerank_v2(request: RerankRequest, raw_request: Request): return await do_rerank(request, raw_request) @@ -770,12 +942,29 @@ async def is_sleeping(raw_request: Request): return JSONResponse(content={"is_sleeping": is_sleeping}) -@router.post("/invocations", dependencies=[Depends(validate_json_request)]) +@router.post("/invocations", + dependencies=[Depends(validate_json_request)], + responses={ + 
HTTPStatus.BAD_REQUEST.value: { + "model": ErrorResponse + }, + HTTPStatus.UNSUPPORTED_MEDIA_TYPE.value: { + "model": ErrorResponse + }, + HTTPStatus.INTERNAL_SERVER_ERROR.value: { + "model": ErrorResponse + }, + }) async def invocations(raw_request: Request): """ For SageMaker, routes requests to other handlers based on model `task`. """ - body = await raw_request.json() + try: + body = await raw_request.json() + except JSONDecodeError as e: + raise HTTPException(status_code=HTTPStatus.BAD_REQUEST.value, + detail=f"JSON decode error: {e}") from e + task = raw_request.app.state.task if task not in TASK_HANDLERS: @@ -866,10 +1055,26 @@ def build_app(args: Namespace) -> FastAPI: allow_headers=args.allowed_headers, ) + @app.exception_handler(HTTPException) + async def http_exception_handler(_: Request, exc: HTTPException): + err = ErrorResponse(message=exc.detail, + type=HTTPStatus(exc.status_code).phrase, + code=exc.status_code) + return JSONResponse(err.model_dump(), status_code=exc.status_code) + @app.exception_handler(RequestValidationError) - async def validation_exception_handler(_, exc): - err = ErrorResponse(message=str(exc), - type="BadRequestError", + async def validation_exception_handler(_: Request, + exc: RequestValidationError): + exc_str = str(exc) + errors_str = str(exc.errors()) + + if exc.errors() and errors_str and errors_str != exc_str: + message = f"{exc_str} {errors_str}" + else: + message = exc_str + + err = ErrorResponse(message=message, + type=HTTPStatus.BAD_REQUEST.phrase, code=HTTPStatus.BAD_REQUEST) return JSONResponse(err.model_dump(), status_code=HTTPStatus.BAD_REQUEST) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a9ba0e4d68ce..ee18e0b0a454 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -197,7 +197,7 @@ async def create_chat_completion( except (ValueError, TypeError, RuntimeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) + return self.create_error_response(f"{e} {e.__cause__}") request_id = "chatcmpl-" \ f"{self._base_request_id(raw_request, request.request_id)}" diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index c642fc51005e..5f4678cb0e69 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -91,7 +91,7 @@ async def create_tokenize( ) except (ValueError, TypeError, jinja2.TemplateError) as e: logger.exception("Error in preprocessing prompt inputs") - return self.create_error_response(str(e)) + return self.create_error_response(f"{e} {e.__cause__}") input_ids: list[int] = [] for i, engine_prompt in enumerate(engine_prompts): From e60f550b3825cbce2d3c7e882b029e2c1d914d8d Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 15 May 2025 09:54:54 +0800 Subject: [PATCH 16/58] [v1] Support multiple KV cache groups in GPU model runner (#17945) Signed-off-by: Chen Zhang --- tests/v1/core/test_kv_cache_utils.py | 71 ++++- tests/v1/core/test_prefix_caching.py | 36 +-- tests/v1/worker/test_gpu_input_batch.py | 39 ++- tests/v1/worker/test_gpu_model_runner.py | 57 ++-- tests/weight_loading/models.txt | 2 +- .../v1/shared_storage_connector.py | 6 +- .../attention/backends/mla/rocm_aiter_mla.py | 4 +- vllm/v1/core/kv_cache_manager.py | 34 ++- vllm/v1/core/kv_cache_utils.py | 13 +- vllm/v1/core/sched/output.py | 12 +- vllm/v1/core/sched/scheduler.py | 16 
+- vllm/v1/kv_cache_interface.py | 42 +++ vllm/v1/worker/block_table.py | 47 +++ vllm/v1/worker/gpu_input_batch.py | 13 +- vllm/v1/worker/gpu_model_runner.py | 270 ++++++++++-------- vllm/v1/worker/tpu_model_runner.py | 35 +-- 16 files changed, 482 insertions(+), 215 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index 1cdc80dd3546..e572100fe7a1 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -19,7 +19,8 @@ hash_request_tokens, unify_kv_cache_configs) from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, - KVCacheGroupSpec, KVCacheTensor) + KVCacheGroupSpec, KVCacheTensor, + SlidingWindowSpec) from vllm.v1.metrics.stats import PrefixCacheStats from vllm.v1.request import Request @@ -54,12 +55,14 @@ def new_kv_cache_spec(block_size=16, num_kv_heads=2, head_size=64, dtype=torch.float32, - use_mla=False): + use_mla=False, + sliding_window=None): return FullAttentionSpec(block_size=block_size, num_kv_heads=num_kv_heads, head_size=head_size, dtype=dtype, - use_mla=use_mla) + use_mla=use_mla, + sliding_window=sliding_window) def test_none_hash(): @@ -471,6 +474,68 @@ def test_unify_kv_cache_configs(): unify_kv_cache_configs(diff_kv_cache_config) +def test_merge_kv_cache_spec(): + same_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=32), + ] + merged_layer_spec = same_layer_specs[0].merge(same_layer_specs) + assert merged_layer_spec.block_size == 16 + assert merged_layer_spec.num_kv_heads == 32 + assert merged_layer_spec.head_size == 64 + assert merged_layer_spec.dtype == torch.float32 + assert merged_layer_spec.sliding_window is None + + different_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=16), + ] + with pytest.raises(AssertionError): + different_layer_specs[0].merge(different_layer_specs) + + full_spec = new_kv_cache_spec(num_kv_heads=32) + different_type_layer_specs = [ + full_spec, + SlidingWindowSpec( + block_size=full_spec.block_size, + num_kv_heads=full_spec.num_kv_heads, + head_size=full_spec.head_size, + dtype=full_spec.dtype, + use_mla=full_spec.use_mla, + sliding_window=1, + ), + ] + with pytest.raises(AssertionError): + different_type_layer_specs[0].merge(different_type_layer_specs) + with pytest.raises(AssertionError): + different_type_layer_specs[1].merge(different_type_layer_specs) + + different_sliding_window_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32), + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=2), + ] + with pytest.raises(ValueError): + different_sliding_window_layer_specs[0].merge( + different_sliding_window_layer_specs) + + same_sliding_window_layer_specs = [ + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + ] + merged_layer_spec = same_sliding_window_layer_specs[0].merge( + same_sliding_window_layer_specs) + assert merged_layer_spec.sliding_window == 1 + + same_sliding_window_layer_spec_with_none = [ + new_kv_cache_spec(num_kv_heads=32, sliding_window=1), + new_kv_cache_spec(num_kv_heads=32, sliding_window=None), + ] + merged_layer_spec = same_sliding_window_layer_spec_with_none[0].merge( + same_sliding_window_layer_spec_with_none) + assert merged_layer_spec.sliding_window == 1 + + @pytest.mark.parametrize( ("model_id", "max_model_len", "want_estimated_max_len"), [ ("Qwen/Qwen1.5-7B", 16385, 16384), diff --git 
a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index 2d7411381e16..3da27786b1f2 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -84,7 +84,7 @@ def test_prefill(hash_algo): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] # Check full block metadata parent_block_hash = None @@ -107,13 +107,13 @@ def test_prefill(hash_algo): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -141,13 +141,13 @@ def test_prefill(hash_algo): req2 = make_request("2", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) assert len(manager.req_to_block_hashes[req2.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req2, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [6] + assert blocks.get_block_ids() == [[6]] # Although we only have 6 free blocks, we have 8 blocks in # the free block queue due to lazy removal. @@ -171,7 +171,7 @@ def test_prefill(hash_algo): len(computed_blocks.blocks) * 16, computed_blocks) # This block ID order also checks the eviction order. 
- assert blocks.get_block_ids() == [7, 8, 9, 10, 4, 5, 6, 3, 2, 1] + assert blocks.get_block_ids() == [[7, 8, 9, 10, 4, 5, 6, 3, 2, 1]] assert manager.block_pool.free_block_queue.num_free_blocks == 0 assert manager.block_pool.free_block_queue.free_list_head is None assert manager.block_pool.free_block_queue.free_list_tail is None @@ -208,7 +208,7 @@ def test_prefill_plp(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0_block_hashes = [b.block_hash for b in blocks.blocks] # Check full block metadata @@ -233,13 +233,13 @@ def test_prefill_plp(): req1 = make_request("1", common_token_ids + unique_token_ids) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req1) assert len(manager.req_to_block_hashes[req1.request_id]) == 3 - assert computed_blocks.get_block_ids() == [1, 2, 3] + assert computed_blocks.get_block_ids() == [[1, 2, 3]] assert num_computed_tokens == 3 * 16 num_new_tokens = 53 - 3 * 16 blocks = manager.allocate_slots(req1, num_new_tokens, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] for block in computed_blocks.blocks: assert block.ref_cnt == 2 @@ -277,11 +277,11 @@ def test_prefill_plp(): block_ids = blocks.get_block_ids() # Duplicate cached blocks have different ids but same hashes vs request #0 assert [b.block_hash for b in blocks.blocks] == req0_block_hashes - assert block_ids != [1, 2, 3, 4] + assert block_ids != [[1, 2, 3, 4]] # Request #2 block hashes are valid since request #0 hashes are. # Check block reference counts. - for block_id in block_ids: + for block_id in block_ids[0]: assert manager.block_pool.blocks[block_id].ref_cnt == 1 manager.free(req2) @@ -307,7 +307,7 @@ def test_decode(): blocks = manager.allocate_slots(req0, 55, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] # Append slots without allocating a new block. req0.num_computed_tokens = 55 @@ -379,12 +379,12 @@ def test_evict(): # Touch the first 2 blocks. req2 = make_request("2", list(range(2 * 16 + 3))) computed_blocks, num_computed_tokens = manager.get_computed_blocks(req2) - assert computed_blocks.get_block_ids() == [1, 2] + assert computed_blocks.get_block_ids() == [[1, 2]] assert num_computed_tokens == 2 * 16 blocks = manager.allocate_slots(req2, 3, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [10] + assert blocks.get_block_ids() == [[10]] assert manager.block_pool.free_block_queue.num_free_blocks == 7 @@ -625,7 +625,7 @@ def test_mm_prefix_caching(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0.num_computed_tokens = 59 # Append slots without allocating a new block. @@ -686,7 +686,7 @@ def test_cache_key_salting(): blocks = manager.allocate_slots(req0, 59, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] req0.num_computed_tokens = 59 # Append slots without allocating a new block. 
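The assertions above reflect the central data-structure change of this patch: block IDs are now grouped per KV cache group, so `get_block_ids()` returns a two-level list even when only one group exists. A minimal standalone sketch of the new layout (illustrative values only, not part of the patch):

# One inner list per KV cache group; with a single group the outer list has
# length 1, which is why the tests above compare against [[...]].
block_ids: list[list[int]] = [[1, 2, 3, 4]]

# Callers that still assume a single group index the outer list explicitly,
# as the KV connector and scheduler hunks later in this patch do.
assert block_ids[0] == [1, 2, 3, 4]

# Newly allocated blocks are appended group by group.
new_block_ids = [[5, 6]]
for group_id, new_ids in enumerate(new_block_ids):
    block_ids[group_id].extend(new_ids)
assert block_ids == [[1, 2, 3, 4, 5, 6]]
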
@@ -797,7 +797,7 @@ def test_reset_prefix_cache(): all_token_ids = full_block_token_ids + unique_token_ids req0 = make_request("0", all_token_ids) blocks = manager.allocate_slots(req0, 55) - assert blocks.get_block_ids() == [1, 2, 3, 4] + assert blocks.get_block_ids() == [[1, 2, 3, 4]] unique_token_ids = [4] * 7 all_token_ids = full_block_token_ids + unique_token_ids @@ -808,7 +808,7 @@ def test_reset_prefix_cache(): blocks = manager.allocate_slots(req1, 7, len(computed_blocks.blocks) * 16, computed_blocks) - assert blocks.get_block_ids() == [5] + assert blocks.get_block_ids() == [[5]] # Failed to reset prefix cache because some blocks are not freed yet. assert not manager.reset_prefix_cache() diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py index 7b1359c8576f..638f5bedcfca 100644 --- a/tests/v1/worker/test_gpu_input_batch.py +++ b/tests/v1/worker/test_gpu_input_batch.py @@ -9,9 +9,11 @@ from vllm.sampling_params import SamplingParams from vllm.utils import is_pin_memory_available, make_tensor_with_pad +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.worker.gpu_input_batch import (BlockTable, CachedRequestState, - InputBatch) +from vllm.v1.worker.block_table import BlockTable, MultiGroupBlockTable +from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch VOCAB_SIZE = 1024 NUM_OUTPUT_TOKENS = 20 @@ -22,6 +24,27 @@ MAX_NUM_PROMPT_TOKENS = 64 +def get_kv_cache_config() -> KVCacheConfig: + return KVCacheConfig( + num_blocks=10, + tensors={ + "layer.0": KVCacheTensor(size=1024), + }, + kv_cache_groups=[ + KVCacheGroupSpec( + layer_names=["layer.0"], + kv_cache_spec=FullAttentionSpec( + block_size=1, + num_kv_heads=1, + head_size=16, + dtype=torch.float16, + use_mla=False, + ), + ), + ], + ) + + def _compare_objs(obj1, obj2): attrs = inspect.getmembers(obj1, lambda a: not (inspect.isroutine(a))) attr_names = set([ @@ -41,6 +64,10 @@ def _compare_objs(obj1, obj2): elif isinstance(a, np.ndarray): if np.allclose(a, b): is_same = True + elif isinstance(a, MultiGroupBlockTable): + for a_i, b_i in zip(a.block_tables, b.block_tables): + _compare_objs(a_i, b_i) + is_same = True elif isinstance(a, (BlockTable, SamplingMetadata)): _compare_objs(a, b) is_same = True # if we make it here must be same @@ -198,7 +225,7 @@ def _construct_cached_request_state(req_id_suffix: int): sampling_params=_create_sampling_params(), mm_inputs=[], mm_positions=[], - block_ids=[], + block_ids=[[]], generator=None, num_computed_tokens=len(output_token_ids), output_token_ids=output_token_ids, @@ -220,11 +247,11 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int): input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + kv_cache_config=get_kv_cache_config(), ) reqs: list[CachedRequestState] = [] req_id_reqs = {} @@ -310,20 +337,20 @@ def test_swap_states_in_input_batch(device: str, batch_size: int, input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + kv_cache_config=get_kv_cache_config(), ) ref_input_batch: InputBatch = InputBatch( max_num_reqs=batch_size, max_model_len=1024, - 
max_num_blocks_per_req=10, max_num_batched_tokens=1024, device=torch.device(device), pin_memory=is_pin_memory_available(), vocab_size=1024, + kv_cache_config=get_kv_cache_config(), ) reqs: list[CachedRequestState] = [] diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 725747294fd8..e44660525763 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 -import weakref import pytest -import torch -from vllm.config import CacheConfig, ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, + SchedulerConfig, VllmConfig) from vllm.sampling_params import SamplingParams from vllm.v1.core.sched.output import (CachedRequestData, NewRequestData, SchedulerOutput) -from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig, + KVCacheGroupSpec, KVCacheTensor) from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.worker.gpu_input_batch import InputBatch from vllm.v1.worker.gpu_model_runner import GPUModelRunner @@ -17,13 +18,34 @@ def initialize_kv_cache(runner: GPUModelRunner): """ Only perform necessary steps in GPUModelRunner.initialize_kv_cache() """ - kv_cache_spec = FullAttentionSpec(block_size=16, - num_kv_heads=1, - head_size=64, - dtype=torch.float16, - use_mla=False) - runner.attn_metadata_builder = runner.attn_backend.get_builder_cls()( - weakref.proxy(runner), kv_cache_spec, runner.input_batch.block_table) + kv_cache_config = KVCacheConfig( + num_blocks=10, + tensors={ + "layer.0": KVCacheTensor(size=1024), + }, + kv_cache_groups=[ + KVCacheGroupSpec( + layer_names=["layer.0"], + kv_cache_spec=FullAttentionSpec( + block_size=16, + num_kv_heads=runner.model_config.get_num_kv_heads( + runner.parallel_config), + head_size=runner.model_config.get_head_size(), + dtype=runner.kv_cache_dtype, + use_mla=False, + )) + ]) + runner.kv_cache_config = kv_cache_config + runner.input_batch = InputBatch( + max_num_reqs=runner.max_num_reqs, + max_model_len=runner.max_model_len, + max_num_batched_tokens=runner.max_num_tokens, + device=runner.device, + pin_memory=runner.pin_memory, + vocab_size=runner.model_config.get_vocab_size(), + kv_cache_config=kv_cache_config, + ) + runner.initialize_attn_backend(kv_cache_config) @pytest.fixture @@ -48,10 +70,12 @@ def model_runner(): swap_space=0, cache_dtype="auto", ) + parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, cache_config=cache_config, scheduler_config=scheduler_config, + parallel_config=parallel_config, ) device = "cuda" @@ -73,7 +97,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput: mm_hashes=[], mm_positions=[], sampling_params=SamplingParams(), - block_ids=[0], + block_ids=[[0]], num_computed_tokens=0, lora_request=None, )) @@ -111,13 +135,14 @@ def _is_sampling_metadata_changed(model_runner, def _is_req_state_block_table_match(model_runner, req_id: str) -> bool: req_index = model_runner.input_batch.req_id_to_index[req_id] - block_table = model_runner.input_batch.block_table + block_table = model_runner.input_batch.block_table[0] req_state = model_runner.requests[req_id] - if block_table.num_blocks_per_row[req_index] != len(req_state.block_ids): + if block_table.num_blocks_per_row[req_index] != len( + req_state.block_ids[0]): return False num_blocks = block_table.num_blocks_per_row[req_index] return 
(block_table.block_table_np[req_index, :num_blocks] == - req_state.block_ids).all() + req_state.block_ids[0]).all() def test_update_states_new_request(model_runner): @@ -200,7 +225,7 @@ def test_update_states_request_resumed(model_runner): req_id=req_id, resumed_from_preemption=False, new_token_ids=[], - new_block_ids=[], + new_block_ids=[[]], num_computed_tokens=0, ) diff --git a/tests/weight_loading/models.txt b/tests/weight_loading/models.txt index 1b797074096e..9164f8595346 100644 --- a/tests/weight_loading/models.txt +++ b/tests/weight_loading/models.txt @@ -2,7 +2,7 @@ gptq_marlin, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main gptq_marlin, TheBloke/Llama-2-7B-GPTQ, main gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, main gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit--1g-actorder_True -gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True +#gptq_marlin, TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ, gptq-8bit-32g-actorder_True gptq_marlin, TechxGenus/gemma-1.1-2b-it-GPTQ, main gptq, robertgshaw2/zephyr-7b-beta-channelwise-gptq, main gptq, TheBloke/Llama-2-7B-GPTQ, main diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py index 0fedb6fd5ed9..0421a65a2c81 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py @@ -288,7 +288,7 @@ def build_connector_meta( for new_req in scheduler_output.scheduled_new_reqs: if new_req.req_id in self._requests_need_load: meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size, is_store=False) total_need_load += 1 @@ -299,7 +299,7 @@ def build_connector_meta( # the original prompt tokens. if not self._found_match_for_request(new_req): meta.add_request(token_ids=new_req.prompt_token_ids, - block_ids=new_req.block_ids, + block_ids=new_req.block_ids[0], block_size=self._block_size, is_store=True) @@ -319,7 +319,7 @@ def build_connector_meta( # NOTE(rob): For resumed req, new_block_ids is all # of the block_ids for the request. - block_ids = cached_req.new_block_ids + block_ids = cached_req.new_block_ids[0] meta.add_request(token_ids=token_ids, block_ids=block_ids, diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index 3abb185c5b8f..7ce39110ac01 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -67,13 +67,13 @@ def __init__(self, runner, kv_cache_spec: AttentionSpec, max_model_len = self.runner.model_config.max_model_len assert max_model_len == 32768,\ "AITER MLA requires max_model_len=32768" - assert self.runner.block_size == 1, "AITER MLA" \ + assert self.kv_cache_spec.block_size == 1, "AITER MLA" \ "only supports block size 1." 
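Just below, the builder reads the page size from its own `kv_cache_spec` rather than from the runner, since each KV cache group may use a different block size. A small standalone sketch of the ceiling division it relies on (made-up page size and sequence lengths, purely illustrative):

import torch

page_size = 16                             # e.g. kv_cache_spec.block_size
seq_lens = torch.tensor([1, 16, 17, 100])  # hypothetical per-request lengths
# Number of pages needed per request: ceil(seq_len / page_size).
block_table_bounds = (seq_lens + page_size - 1) // page_size
assert block_table_bounds.tolist() == [1, 1, 2, 7]
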
def _get_paged_kv_tensors( self, block_table: torch.Tensor, seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]: - page_size = self.runner.block_size + page_size = self.kv_cache_spec.block_size block_table_bounds = (seq_lens + page_size - 1) // page_size mask = (torch.arange(block_table.size(1), diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 598fc871110e..da18ece7555a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -32,9 +32,16 @@ def create_empty(cls) -> "KVCacheBlocks": """Creates a new KVCacheBlocks instance with no blocks.""" return cls([]) - def get_block_ids(self) -> list[int]: - """Converts the KVCacheBlocks instance to a list of block IDs.""" - return [block.block_id for block in self.blocks] + def get_block_ids(self) -> list[list[int]]: + """ + Converts the KVCacheBlocks instance to block_ids. + + Returns: + list[list[int]]: A two-level list where + * the outer list corresponds to KV cache groups (only 1 group now) + * each inner list contains the block_ids of the blocks in that group + """ + return [[block.block_id for block in self.blocks]] def get_unhashed_block_ids(self) -> list[int]: """Get block_ids of unhashed blocks from KVCacheBlocks instance.""" @@ -300,9 +307,9 @@ def get_num_common_prefix_blocks( self, request: Request, num_running_requests: int, - ) -> int: + ) -> list[int]: """Calculate the number of common prefix blocks shared by all requests - in the RUNNING state. + in the RUNNING state for each kv cache group. The function determines this by selecting any request and iterating through its blocks. A block is considered a common prefix block if its @@ -332,11 +339,14 @@ def get_num_common_prefix_blocks( requests in the current step. Returns: - int: The number of common prefix blocks. + list[int]: The number of common prefix blocks for each kv cache + group. """ assert request.status == RequestStatus.RUNNING - return self.single_type_manager.get_num_common_prefix_blocks( - request.request_id, num_running_requests) + return [ + self.single_type_manager.get_num_common_prefix_blocks( + request.request_id, num_running_requests) + ] def free_block_hashes(self, request: Request) -> None: """Discard the block hashes for the request. 
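A rough sketch of how the per-group common-prefix count introduced above is consumed downstream (hypothetical numbers; the real consumers are the scheduler and GPU model runner hunks later in this patch):

# get_num_common_prefix_blocks() now returns one entry per KV cache group.
num_common_prefix_blocks = [3]   # single full-attention group in today's setup
group_block_sizes = [16]         # block size of each group's kv_cache_spec

# The runner converts blocks to tokens with the group's own block size before
# deciding whether cascade attention is worthwhile for that group.
common_prefix_len_per_group = [
    blocks * size
    for blocks, size in zip(num_common_prefix_blocks, group_block_sizes)
]
assert common_prefix_len_per_group == [48]
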
@@ -354,10 +364,8 @@ def take_events(self) -> list[KVCacheEvent]: """ return self.block_pool.take_events() - def get_block_ids(self, request_id: str) -> list[int]: + def get_block_ids(self, request_id: str) -> list[list[int]]: """Get the block ids of a request.""" assert request_id in self.single_type_manager.req_to_blocks - return [ - block.block_id - for block in self.single_type_manager.req_to_blocks[request_id] - ] + return KVCacheBlocks(self.single_type_manager.req_to_blocks[request_id] + ).get_block_ids() diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 27c515835087..403b5401be75 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -577,14 +577,12 @@ def create_kv_cache_group_specs( """ kv_cache_groups = [] for layer_names_one_group in grouped_layer_names: - layer_spec = kv_cache_spec[layer_names_one_group[0]] - assert all( - kv_cache_spec[layer_name] == layer_spec - for layer_name in layer_names_one_group[1:]), ( - "All layers in the same KV cache group must share the same " - "KVCacheSpec.") + layer_specs = [ + kv_cache_spec[layer_name] for layer_name in layer_names_one_group + ] + merged_layer_spec = layer_specs[0].merge(layer_specs) kv_cache_groups.append( - KVCacheGroupSpec(layer_names_one_group, layer_spec)) + KVCacheGroupSpec(layer_names_one_group, merged_layer_spec)) return kv_cache_groups @@ -683,6 +681,7 @@ def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]): head_size=spec.head_size, dtype=spec.dtype, use_mla=spec.use_mla, + sliding_window=spec.sliding_window, ) diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index 24032498e50b..257234430983 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -26,7 +26,7 @@ class NewRequestData: mm_hashes: list[str] mm_positions: list[PlaceholderRange] sampling_params: SamplingParams - block_ids: list[int] + block_ids: list[list[int]] num_computed_tokens: int lora_request: Optional[LoRARequest] @@ -34,7 +34,7 @@ class NewRequestData: def from_request( cls, request: Request, - block_ids: list[int], + block_ids: list[list[int]], ) -> NewRequestData: return cls( req_id=request.request_id, @@ -85,7 +85,7 @@ class CachedRequestData: # request's block IDs instead of appending to the existing block IDs. resumed_from_preemption: bool new_token_ids: list[int] - new_block_ids: list[int] + new_block_ids: list[list[int]] num_computed_tokens: int @classmethod @@ -94,7 +94,7 @@ def from_request( request: Request, resumed_from_preemption: bool, new_token_ids: list[int], - new_block_ids: list[int], + new_block_ids: list[list[int]], ) -> CachedRequestData: return cls( req_id=request.request_id, @@ -131,9 +131,9 @@ class SchedulerOutput: # E.g., if a request has [0, 1], it could mean the vision encoder needs # to process that the request's 0-th and 1-th images in the current step. scheduled_encoder_inputs: dict[str, list[int]] - # Number of common prefix blocks for all requests. + # Number of common prefix blocks for all requests in each KV cache group. # This can be used for cascade attention. - num_common_prefix_blocks: int + num_common_prefix_blocks: list[int] # Request IDs that are finished in between the previous and the current # steps. 
This is used to notify the workers about the finished requests diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 96313c288f7d..5ad05485e8f3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -173,7 +173,7 @@ def schedule(self) -> SchedulerOutput: # uses structured decoding. structured_output_request_ids: dict[str, int] = {} - req_to_new_block_ids: dict[str, list[int]] = {} + req_to_new_block_ids: dict[str, list[list[int]]] = {} num_scheduled_tokens: dict[str, int] = {} token_budget = self.max_num_scheduled_tokens # Encoder-related. @@ -477,7 +477,8 @@ def schedule(self) -> SchedulerOutput: # Get the longest common prefix among all requests in the running queue. # This can be potentially used for cascade attention. - num_common_prefix_blocks = 0 + num_common_prefix_blocks = [0] * len( + self.kv_cache_config.kv_cache_groups) if self.running: any_request = self.running[0] num_common_prefix_blocks = ( @@ -564,7 +565,7 @@ def _make_cached_request_data( request: Request, num_scheduled_tokens: int, num_scheduled_spec_tokens: int, - new_block_ids: list[int], + new_block_ids: list[list[int]], resumed_from_preemption: bool, ) -> CachedRequestData: # OPTIMIZATION: Cache the CachedRequestData objects to avoid creating @@ -939,7 +940,9 @@ def _connector_finished( """ if self.connector is None: return False, None - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + assert len(self.kv_cache_config.kv_cache_groups + ) == 1, "KV connector only supports one KV cache group now" + block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] return self.connector.request_finished(request, block_ids) def _update_waiting_for_remote_kv(self, request: Request) -> bool: @@ -956,9 +959,10 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool: """ if request.request_id not in self.finished_recving_kv_req_ids: return False - + assert len(self.kv_cache_config.kv_cache_groups + ) == 1, "KV connector only supports one KV cache group now" # Now that the blocks are ready, actually cache them. - block_ids = self.kv_cache_manager.get_block_ids(request.request_id) + block_ids = self.kv_cache_manager.get_block_ids(request.request_id)[0] num_computed_tokens = len(block_ids) * self.block_size if num_computed_tokens == request.num_tokens: num_computed_tokens -= 1 diff --git a/vllm/v1/kv_cache_interface.py b/vllm/v1/kv_cache_interface.py index 4fc0844cd1f4..2747fc7fabd1 100644 --- a/vllm/v1/kv_cache_interface.py +++ b/vllm/v1/kv_cache_interface.py @@ -1,8 +1,11 @@ # SPDX-License-Identifier: Apache-2.0 +import copy from dataclasses import dataclass +from typing import Optional import torch +from typing_extensions import Self from vllm.config import VllmConfig from vllm.logger import init_logger @@ -53,6 +56,16 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: """ raise NotImplementedError + @classmethod + def merge(cls, specs: list[Self]) -> Self: + """ + Merge a list of KVCacheSpec objects into a single KVCacheSpec object. 
+ """ + assert all(spec.type_id == specs[0].type_id for spec in specs[1:]), ( + "All layers in the same KV cache group must share the same " + "type_id.") + return copy.deepcopy(specs[0]) + @dataclass class AttentionSpec(KVCacheSpec): @@ -71,6 +84,16 @@ def page_size_bytes(self) -> int: @dataclass class FullAttentionSpec(AttentionSpec): + sliding_window: Optional[int] = None + """ + When hybrid allocator is disabled and the model contains both full + attention layers and sliding window attention layers, sliding + window attention are regarded as full attention in KV cache manager + (blocks are allocated for all tokens), while computed as sliding window + attention in model runner. + In this case, we use FullAttentionSpec and record the sliding window size. + Default to None for not using sliding window attention. + """ @property def type_id(self) -> str: @@ -80,6 +103,25 @@ def max_memory_usage_bytes(self, vllm_config: VllmConfig) -> int: max_model_len = vllm_config.model_config.max_model_len return cdiv(max_model_len, self.block_size) * self.page_size_bytes + @classmethod + def merge(cls, specs: list[Self]) -> Self: + """ + Merge a list of FullAttentionSpec objects into a single + FullAttentionSpec object. + """ + merged_spec = super().merge(specs) + sliding_window = set(spec.sliding_window for spec in specs + if spec.sliding_window is not None) + if len(sliding_window) == 0: + merged_spec.sliding_window = None + elif len(sliding_window) == 1: + merged_spec.sliding_window = sliding_window.pop() + else: + raise ValueError( + "All sliding window layers in the same KV cache group " + "must have the same window size.") + return merged_spec + @dataclass class SlidingWindowSpec(AttentionSpec): diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py index 581d3d9bd11b..0c3341691509 100644 --- a/vllm/v1/worker/block_table.py +++ b/vllm/v1/worker/block_table.py @@ -4,6 +4,8 @@ import torch from vllm.logger import init_logger +from vllm.utils import cdiv +from vllm.v1.kv_cache_interface import KVCacheConfig logger = init_logger(__name__) @@ -96,3 +98,48 @@ def get_cpu_tensor(self) -> torch.Tensor: def get_numpy_array(self) -> np.ndarray: """Returns the numpy array of the block table.""" return self.block_table_np + + +class MultiGroupBlockTable: + """The BlockTables for each KV cache group.""" + + def __init__(self, max_num_reqs: int, max_model_len: int, + max_num_batched_tokens: int, pin_memory: bool, + device: torch.device, kv_cache_config: KVCacheConfig) -> None: + max_num_blocks_per_req = [ + cdiv(max_model_len, g.kv_cache_spec.block_size) + for g in kv_cache_config.kv_cache_groups + ] + self.block_tables = [ + BlockTable(max_num_reqs, max_num_blocks_per_req[i], + max_num_batched_tokens, pin_memory, device) + for i in range(len(kv_cache_config.kv_cache_groups)) + ] + + def append_row(self, block_ids: list[list[int]], row_idx: int) -> None: + for i, block_table in enumerate(self.block_tables): + block_table.append_row(block_ids[i], row_idx) + + def add_row(self, block_ids: list[list[int]], row_idx: int) -> None: + for i, block_table in enumerate(self.block_tables): + block_table.add_row(block_ids[i], row_idx) + + def move_row(self, src: int, tgt: int) -> None: + for block_table in self.block_tables: + block_table.move_row(src, tgt) + + def swap_row(self, src: int, tgt: int) -> None: + for block_table in self.block_tables: + block_table.swap_row(src, tgt) + + def commit(self, num_reqs: int) -> None: + for block_table in self.block_tables: + block_table.commit(num_reqs) + + def 
clear(self) -> None: + for block_table in self.block_tables: + block_table.clear() + + def __getitem__(self, idx: int) -> "BlockTable": + """Returns the BlockTable for the i-th KV cache group.""" + return self.block_tables[idx] diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py index 871654fca366..570de9bddd29 100644 --- a/vllm/v1/worker/gpu_input_batch.py +++ b/vllm/v1/worker/gpu_input_batch.py @@ -11,10 +11,11 @@ from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import swap_dict_values +from vllm.v1.kv_cache_interface import KVCacheConfig from vllm.v1.outputs import LogprobsTensors from vllm.v1.sample.metadata import SamplingMetadata from vllm.v1.utils import copy_slice -from vllm.v1.worker.block_table import BlockTable +from vllm.v1.worker.block_table import MultiGroupBlockTable _SAMPLING_EPS = 1e-5 @@ -29,7 +30,7 @@ class CachedRequestState: sampling_params: SamplingParams generator: Optional[torch.Generator] - block_ids: list[int] + block_ids: list[list[int]] num_computed_tokens: int output_token_ids: list[int] @@ -58,15 +59,14 @@ def __init__( self, max_num_reqs: int, max_model_len: int, - max_num_blocks_per_req: int, max_num_batched_tokens: int, device: torch.device, pin_memory: bool, vocab_size: int, + kv_cache_config: KVCacheConfig, ): self.max_num_reqs = max_num_reqs self.max_model_len = max_model_len - self.max_num_blocks_per_req = max_num_blocks_per_req self.max_num_batched_tokens = max_num_batched_tokens self.device = device self.pin_memory = pin_memory @@ -99,12 +99,13 @@ def __init__( self.num_computed_tokens_cpu_tensor.numpy() # Block table. - self.block_table = BlockTable( + self.block_table = MultiGroupBlockTable( max_num_reqs=max_num_reqs, - max_num_blocks_per_req=max_num_blocks_per_req, + max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, pin_memory=pin_memory, device=device, + kv_cache_config=kv_cache_config, ) # Sampling-related. 
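The gpu_model_runner.py hunks below generalize the slot-mapping and attention-metadata code to loop over KV cache groups, each with its own block table and block size. A condensed, standalone sketch of the per-group slot-mapping arithmetic (made-up values, simplified from the real code):

import numpy as np

block_size = 2                 # this group's kv_cache_spec.block_size
max_num_blocks_per_req = 4     # width of this group's block table
# Two requests: request 0 owns physical blocks [10, 11], request 1 owns [20].
block_table = np.array([
    [10, 11, 0, 0],
    [20, 0, 0, 0],
], dtype=np.int32)

# Three scheduled tokens: positions 2 and 3 of request 0, position 1 of request 1.
req_indices = np.array([0, 0, 1])
positions = np.array([2, 3, 1])

block_table_indices = (req_indices * max_num_blocks_per_req
                       + positions // block_size)
block_numbers = block_table.flatten()[block_table_indices]
slot_mapping = block_numbers * block_size + positions % block_size
# Block 11 covers slots 22-23 and block 20 covers slots 40-41.
assert slot_mapping.tolist() == [22, 23, 41]

In the runner itself, the block table for a given group is selected with `self.input_batch.block_table[group_id]`, which is the indexing the MultiGroupBlockTable wrapper above provides.
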
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1b16f273a6de..1b34a9fb0616 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -12,6 +12,8 @@ import torch.nn as nn from vllm.attention import AttentionType, get_attn_backend +from vllm.attention.backends.abstract import (AttentionBackend, + AttentionMetadataBuilder) from vllm.attention.layer import Attention from vllm.attention.utils.fa_utils import get_flash_attn_version from vllm.config import (CompilationLevel, VllmConfig, @@ -31,8 +33,8 @@ from vllm.sampling_params import SamplingType from vllm.sequence import IntermediateTensors from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, - GiB_bytes, LayerBlockType, LazyLoader, cdiv, - check_use_alibi, is_pin_memory_available) + GiB_bytes, LazyLoader, cdiv, check_use_alibi, + is_pin_memory_available) from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.utils import CommonAttentionMetadata from vllm.v1.core.encoder_cache_manager import compute_encoder_budget @@ -49,6 +51,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.utils import is_spec_decode_supported from vllm.v1.utils import bind_kv_cache +from vllm.v1.worker.block_table import BlockTable from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin @@ -100,59 +103,17 @@ def __init__( self.kv_cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[ cache_config.cache_dtype] - # NOTE(woosuk): sliding_window is None for models with interleaved - # attention. Use interleaved_sliding_window instead. - self.sliding_window = model_config.get_sliding_window() - self.interleaved_sliding_window = getattr( - model_config.hf_text_config, "interleaved_sliding_window", None) - self.window_size = (self.sliding_window - or self.interleaved_sliding_window) - self.is_multimodal_model = model_config.is_multimodal_model - self.block_size = cache_config.block_size self.max_model_len = model_config.max_model_len - self.max_num_blocks_per_req = cdiv(self.max_model_len, self.block_size) self.max_num_tokens = scheduler_config.max_num_batched_tokens self.max_num_reqs = scheduler_config.max_num_seqs # Model-related. 
- self.num_attn_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention) self.num_query_heads = model_config.get_num_attention_heads( parallel_config) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - self.head_size = model_config.get_head_size() self.hidden_size = model_config.get_hidden_size() self.attention_chunk_size = model_config.attention_chunk_size - self.attn_backend = get_attn_backend( - self.head_size, - self.dtype, - self.kv_cache_dtype, - self.block_size, - self.model_config.is_attention_free, - use_mla=self.model_config.use_mla, - ) - if self.attn_backend is None: - error_msg = ( - f"Error with get_att_backend: {self.head_size=}, " - f"{self.dtype=}, {self.kv_cache_dtype=}, {self.block_size=}, " - f"{self.model_config.is_attention_free=}, " - f"{self.model_config.use_mla=}") - logger.error(error_msg) - raise NotImplementedError( - "Non-Attention backend is not supported by V1 GPUModelRunner.") - - if self.vllm_config.compilation_config.full_cuda_graph: - attn_backend_name = self.attn_backend.__name__ - flash_attn_version = get_flash_attn_version() - if attn_backend_name != "FlashAttentionBackend" or \ - flash_attn_version != 3: - raise ValueError( - f"full_cuda_graph is only supported with " - f"FA3. Current attention backend is {attn_backend_name}, " - f"FlashAttention version is {flash_attn_version}.") - self.cascade_attn_enabled = not self.model_config.disable_cascade_attn # Multi-modal data support @@ -174,8 +135,10 @@ def __init__( # self.model: nn.Module # Set after load_model # Initialize in initialize_kv_cache self.kv_caches: list[torch.Tensor] = [] + self.attn_metadata_builders: list[AttentionMetadataBuilder] = [] + self.attn_backends: list[type[AttentionBackend]] = [] # self.kv_cache_config: KVCacheConfig - # self.attn_metadata_builder: type[AttentionMetadataBuilder] + # self.input_batch: InputBatch # Persistent batch. # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} @@ -200,16 +163,6 @@ def __init__( # Request states. self.requests: dict[str, CachedRequestState] = {} - # Persistent batch. - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_blocks_per_req=self.max_num_blocks_per_req, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=model_config.get_vocab_size(), - ) self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE @@ -304,6 +257,31 @@ def __init__( pin_memory=self.pin_memory) self.seq_lens_np = self.seq_lens_cpu.numpy() + def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool: + """ + Update the order of requests in the batch based on the attention + backend's needs. For example, some attention backends (namely MLA) may + want to separate requests based on if the attention computation will be + compute-bound or memory-bound. + + Args: + scheduler_output: The scheduler output. + + Returns: + True if the batch was reordered, False otherwise. + """ + batch_reordered = self.attn_metadata_builders[0].reorder_batch( + self.input_batch, scheduler_output) + + # For models with multiple KV cache groups, the groups should agree on + # the same order of requests. We ensure this by only allowing the first + # group to reorder the batch and asserting that all other groups do not + # reorder the batch. 
+ for i in range(1, len(self.kv_cache_config.kv_cache_groups)): + assert not self.attn_metadata_builders[i].reorder_batch( + self.input_batch, scheduler_output) + return batch_reordered + def _update_states(self, scheduler_output: "SchedulerOutput") -> None: """Update the cached states and the persistent batch with the scheduler output. @@ -440,7 +418,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Update the block IDs. if not req_data.resumed_from_preemption: # Append the new blocks to the existing block IDs. - req_state.block_ids.extend(req_data.new_block_ids) + for i in range(len(self.kv_cache_config.kv_cache_groups)): + req_state.block_ids[i].extend(req_data.new_block_ids[i]) else: # The request is resumed from preemption. # Replace the existing block IDs with the new ones. @@ -498,11 +477,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: if removed_req_indices: self.input_batch.condense(removed_req_indices) - # Some attention backends (namely MLA) may want to separate requests - # based on if the attention computation will be compute-bound or - # memory-bound. This gives them a hook to do that. - batch_reordered = self.attn_metadata_builder.reorder_batch( - self.input_batch, scheduler_output) + batch_reordered = self._may_reorder_batch(scheduler_output) if batch_changed or batch_reordered: self.input_batch.refresh_sampling_metadata() @@ -570,21 +545,29 @@ def _prepare_inputs( torch.from_numpy(token_indices), out=self.input_ids_cpu[:total_num_scheduled_tokens]) - # Calculate the slot mapping. - # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] - # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] - # where K is the max_num_blocks_per_req and the block size is 2. - # NOTE(woosuk): We can't simply use `token_indices // block_size` here - # because M (max_model_len) is not necessarily divisible by block_size. - block_table_indices = (req_indices * self.max_num_blocks_per_req + - positions_np // self.block_size) - block_table_cpu = self.input_batch.block_table.get_cpu_tensor() - block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() - block_offsets = positions_np % self.block_size - np.add(block_numbers * self.block_size, - block_offsets, - out=self.input_batch.block_table. - slot_mapping_np[:total_num_scheduled_tokens]) + # Calculate the slot mapping for each KV cache group. + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + block_size = kv_cache_group_spec.kv_cache_spec.block_size + block_table: BlockTable = self.input_batch.block_table[ + kv_cache_group_id] + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` + # here because M (max_model_len) is not necessarily divisible by + # block_size. + block_table_indices = ( + req_indices * block_table.max_num_blocks_per_req + + positions_np // block_size) + block_table_cpu = block_table.get_cpu_tensor() + block_numbers = block_table_cpu.flatten( + )[block_table_indices].numpy() + block_offsets = positions_np % block_size + np.add( + block_numbers * block_size, + block_offsets, + out=block_table.slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. 
self.query_start_loc_np[0] = 0 @@ -626,10 +609,6 @@ def _prepare_inputs( attn_metadata: dict[str, FlashAttentionMetadata] = {} # Prepare the attention metadata for each KV cache group and make layers # in the same group share the same metadata. - # NOTE(Chen): there is exactly one KV cache group that contains all - # attetnion layers in the model for now, so the current logic for - # getting attn_metadata is not related to kv_cache_group information. - # Will extend this part to support multiple KV cache groups later. for kv_cache_group_id, kv_cache_group_spec in enumerate( self.kv_cache_config.kv_cache_groups): @@ -638,15 +617,19 @@ def _prepare_inputs( if self.cascade_attn_enabled: common_prefix_len = self._compute_cascade_attn_prefix_len( num_scheduled_tokens, - scheduler_output.num_common_prefix_blocks, + scheduler_output. + num_common_prefix_blocks[kv_cache_group_id], + kv_cache_group_spec.kv_cache_spec, + self.attn_metadata_builders[kv_cache_group_id], ) - attn_metadata_i = self.attn_metadata_builder.build( - num_reqs=num_reqs, - num_actual_tokens=total_num_scheduled_tokens, - max_query_len=max_num_scheduled_tokens, - common_prefix_len=common_prefix_len, - common_attn_metadata=common_attn_metadata) + attn_metadata_i = ( + self.attn_metadata_builders[kv_cache_group_id].build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=common_prefix_len, + common_attn_metadata=common_attn_metadata)) for layer_name in kv_cache_group_spec.layer_names: attn_metadata[layer_name] = attn_metadata_i @@ -684,6 +667,8 @@ def _compute_cascade_attn_prefix_len( self, num_scheduled_tokens: np.ndarray, num_common_prefix_blocks: int, + kv_cache_spec: KVCacheSpec, + attn_metadata_builder: AttentionMetadataBuilder, ) -> int: """Compute the length of the common prefix for cascade attention. @@ -702,7 +687,7 @@ def _compute_cascade_attn_prefix_len( Returns: int: Length of common prefix in tokens. """ - common_prefix_len = num_common_prefix_blocks * self.block_size + common_prefix_len = num_common_prefix_blocks * kv_cache_spec.block_size if common_prefix_len == 0: # Common case. return 0 @@ -751,15 +736,19 @@ def _compute_cascade_attn_prefix_len( common_prefix_len, self.input_batch.num_computed_tokens_cpu[:num_reqs].min()) # common_prefix_len should be a multiple of the block size. 
- common_prefix_len = (common_prefix_len // self.block_size * - self.block_size) - use_cascade = self.attn_metadata_builder.use_cascade_attention( + common_prefix_len = (common_prefix_len // kv_cache_spec.block_size * + kv_cache_spec.block_size) + use_sliding_window = (isinstance(kv_cache_spec, SlidingWindowSpec) or + (isinstance(kv_cache_spec, FullAttentionSpec) + and kv_cache_spec.sliding_window is not None)) + assert isinstance(kv_cache_spec, AttentionSpec) + use_cascade = attn_metadata_builder.use_cascade_attention( common_prefix_len=common_prefix_len, query_lens=num_scheduled_tokens, num_query_heads=self.num_query_heads, - num_kv_heads=self.num_kv_heads, + num_kv_heads=kv_cache_spec.num_kv_heads, use_alibi=self.use_alibi, - use_sliding_window=self.window_size is not None, + use_sliding_window=use_sliding_window, num_sms=self.num_sms, ) return common_prefix_len if use_cascade else 0 @@ -1577,7 +1566,7 @@ def _dummy_run( dtype=np.int32) if skip_attn: - attn_metadata = None + attn_metadata: Optional[dict[str, FlashAttentionMetadata]] = None else: query_start_loc = self.query_start_loc[:num_reqs + 1] seq_lens = self.seq_lens[:num_reqs] @@ -1585,13 +1574,19 @@ def _dummy_run( common_attn_metadata = CommonAttentionMetadata( query_start_loc=query_start_loc, seq_lens=seq_lens) - attn_metadata = self.attn_metadata_builder.build( - num_reqs=num_tokens, - num_actual_tokens=num_tokens, - max_query_len=num_tokens, - common_prefix_len=0, - common_attn_metadata=common_attn_metadata, - ) + attn_metadata = {} + for kv_cache_group_id, kv_cache_group_spec in enumerate( + self.kv_cache_config.kv_cache_groups): + attn_metadata_i = ( + self.attn_metadata_builders[kv_cache_group_id].build( + num_reqs=num_tokens, + num_actual_tokens=num_tokens, + max_query_len=num_tokens, + common_prefix_len=0, + common_attn_metadata=common_attn_metadata, + )) + for layer_name in kv_cache_group_spec.layer_names: + attn_metadata[layer_name] = attn_metadata_i with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens): @@ -1822,6 +1817,56 @@ def capture_model(self) -> None: logger.info("Graph capturing finished in %.0f secs, took %.2f GiB", elapsed_time, cuda_graph_size / (1 << 30)) + def initialize_attn_backend(self, kv_cache_config: KVCacheConfig) -> None: + """ + Initialize the attention backends and attention metadata builders. 
+ """ + assert len(self.attn_backends) == 0 and len( + self.attn_metadata_builders + ) == 0, "Attention backends are already initialized" + for i, kv_cache_group_spec in enumerate( + kv_cache_config.kv_cache_groups): + kv_cache_spec = kv_cache_group_spec.kv_cache_spec + if not isinstance(kv_cache_spec, AttentionSpec): + raise NotImplementedError( + "Only AttentionSpec is supported for now.") + attn_backend_i = get_attn_backend( + kv_cache_spec.head_size, + self.dtype, + kv_cache_spec.dtype, + kv_cache_spec.block_size, + self.model_config.is_attention_free, + use_mla=kv_cache_spec.use_mla, + ) + if attn_backend_i is None: + error_msg = ( + f"Error with get_attn_backend: {kv_cache_spec.head_size=}, " + f"{self.dtype=}, {kv_cache_spec.dtype=}, " + f"{kv_cache_spec.block_size=}, " + f"{self.model_config.is_attention_free=}, " + f"{kv_cache_spec.use_mla=}") + logger.error(error_msg) + raise NotImplementedError( + "Non-Attention backend is not supported by V1 " + "GPUModelRunner.") + + if self.vllm_config.compilation_config.full_cuda_graph: + attn_backend_name = attn_backend_i.__name__ + flash_attn_version = get_flash_attn_version() + if attn_backend_name != "FlashAttentionBackend" or \ + flash_attn_version != 3: + raise ValueError( + f"full_cuda_graph is only supported with " + f"FA3. Current attention backend is " + f"{attn_backend_name}, FlashAttention version is " + f"{flash_attn_version}.") + + block_table_i = self.input_batch.block_table[i] + attn_metadata_builder_i = attn_backend_i.get_builder_cls()( + weakref.proxy(self), kv_cache_spec, block_table_i) + self.attn_backends.append(attn_backend_i) + self.attn_metadata_builders.append(attn_metadata_builder_i) + def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: """ Initialize KV cache based on `kv_cache_config`. @@ -1829,15 +1874,21 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: kv_cache_config: Configuration for the KV cache, including the KV cache size of each layer """ - if len(kv_cache_config.kv_cache_groups) > 1: - raise NotImplementedError( - "Hybrid models with more than one KV cache type are not " - "supported yet.") self.kv_cache_config = kv_cache_config + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + kv_cache_config=kv_cache_config, + ) + self.initialize_attn_backend(kv_cache_config) kv_caches: dict[str, torch.Tensor] = {} - for kv_cache_group in kv_cache_config.kv_cache_groups: + for i, kv_cache_group in enumerate(kv_cache_config.kv_cache_groups): kv_cache_spec = kv_cache_group.kv_cache_spec for layer_name in kv_cache_group.layer_names: tensor_config = kv_cache_config.tensors[layer_name] @@ -1852,7 +1903,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: # the min of all `num_blocks`. Verify it here. 
assert num_blocks >= kv_cache_config.num_blocks if isinstance(kv_cache_spec, AttentionSpec): - kv_cache_shape = self.attn_backend.get_kv_cache_shape( + kv_cache_shape = self.attn_backends[i].get_kv_cache_shape( num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, kv_cache_spec.head_size) dtype = kv_cache_spec.dtype @@ -1872,11 +1923,6 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: if has_kv_transfer_group(): get_kv_transfer_group().register_kv_caches(kv_caches) - self.attn_metadata_builder = self.attn_backend.get_builder_cls()( - weakref.proxy(self), - kv_cache_config.kv_cache_groups[0].kv_cache_spec, - self.input_batch.block_table) - def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]: """ Generates the KVCacheSpec by parsing the kv cache format from each diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index b4daf5a34678..2da99696445e 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -171,19 +171,10 @@ def __init__( self.kv_caches: list[torch.Tensor] = [] # req_id -> (input_id -> encoder_output) self.encoder_cache: dict[str, dict[int, torch.Tensor]] = {} + # self.input_batch: InputBatch # Persistent batch. # Request states. self.requests: dict[str, CachedRequestState] = {} - # Persistent batch. - self.input_batch = InputBatch( - max_num_reqs=self.max_num_reqs, - max_model_len=self.max_model_len, - max_num_blocks_per_req=self.max_num_blocks_per_req, - max_num_batched_tokens=self.max_num_tokens, - device=self.device, - pin_memory=self.pin_memory, - vocab_size=self.vocab_size, - ) # Cached torch/numpy tensor # The pytorch tensor and numpy array share the same buffer. @@ -199,7 +190,7 @@ def __init__( self.block_table_cpu = torch.zeros( (self.max_num_reqs, self.max_num_blocks_per_req), - dtype=self.input_batch.block_table.get_cpu_tensor().dtype, + dtype=torch.int32, device="cpu") self.query_start_loc_cpu = torch.zeros(self.max_num_tokens + 1, @@ -524,12 +515,12 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # NOTE(woosuk): We use torch.index_select instead of np.take here # because torch.index_select is much faster than np.take for large # tensors. - block_table_cpu = self.input_batch.block_table.get_cpu_tensor() + block_table_cpu = self.input_batch.block_table[0].get_cpu_tensor() block_numbers = block_table_cpu.flatten()[block_table_indices].numpy() block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, - out=self.input_batch.block_table. + out=self.input_batch.block_table[0]. slot_mapping_np[:total_num_scheduled_tokens]) # Prepare the attention metadata. @@ -554,15 +545,15 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): self.position_ids = self.positions_cpu[: padded_total_num_scheduled_tokens].to( self.device) - self.input_batch.block_table.slot_mapping_cpu[ + self.input_batch.block_table[0].slot_mapping_cpu[ total_num_scheduled_tokens:] = _PAD_SLOT_ID slot_mapping = ( - self.input_batch.block_table. + self.input_batch.block_table[0]. 
slot_mapping_cpu[:padded_total_num_scheduled_tokens].to( self.device)) block_tables = self.block_table_cpu[:self.max_num_reqs] block_tables[:num_reqs, :self.max_num_blocks_per_req] = ( - self.input_batch.block_table.get_cpu_tensor()[:num_reqs]) + self.input_batch.block_table[0].get_cpu_tensor()[:num_reqs]) block_tables = block_tables.to(self.device) query_start_loc = self.query_start_loc_cpu[:self.max_num_reqs + 1].to( self.device) @@ -1263,6 +1254,18 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None: "Hybrid models with more than one KV cache type are not " "supported yet.") + self.input_batch = InputBatch( + max_num_reqs=self.max_num_reqs, + max_model_len=self.max_model_len, + max_num_batched_tokens=self.max_num_tokens, + device=self.device, + pin_memory=self.pin_memory, + vocab_size=self.model_config.get_vocab_size(), + kv_cache_config=kv_cache_config, + ) + assert self.block_table_cpu.dtype == self.input_batch.block_table[ + 0].get_cpu_tensor().dtype + kv_caches: dict[str, torch.Tensor] = {} for kv_cache_group in kv_cache_config.kv_cache_groups: From 65334ef3b9e4fd32ebc5c4e512debc25d5025488 Mon Sep 17 00:00:00 2001 From: Mark McLoughlin Date: Thu, 15 May 2025 04:13:17 +0100 Subject: [PATCH 17/58] [V1][Metrics] Remove unused code (#18158) Signed-off-by: Mark McLoughlin --- .buildkite/test-pipeline.yaml | 1 - tests/v1/test_stats.py | 302 ----------------------- vllm/v1/stats/__init__.py | 0 vllm/v1/stats/common.py | 453 ---------------------------------- 4 files changed, 756 deletions(-) delete mode 100644 tests/v1/test_stats.py delete mode 100644 vllm/v1/stats/__init__.py delete mode 100644 vllm/v1/stats/common.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1040d1e1b801..1eb3e1f4c482 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -216,7 +216,6 @@ steps: - pytest -v -s v1/spec_decode - pytest -v -s v1/kv_connector/unit - pytest -v -s v1/test_serial_utils.py - - pytest -v -s v1/test_stats.py - pytest -v -s v1/test_utils.py - pytest -v -s v1/test_oracle.py # TODO: accuracy does not match, whether setting diff --git a/tests/v1/test_stats.py b/tests/v1/test_stats.py deleted file mode 100644 index 48419d8a2791..000000000000 --- a/tests/v1/test_stats.py +++ /dev/null @@ -1,302 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import pytest - -from vllm.sampling_params import SamplingParams -from vllm.v1.stats.common import RequestStats, RequestStatsUpdate - - -def make_update( - request_id: str, - update_type: RequestStatsUpdate.Type, - monotonic_ts_s: float, - **kwargs, -): - if update_type == RequestStatsUpdate.Type.INPUT_PROCESSED: - kwargs.setdefault("sampling_params", SamplingParams(n=1)) - kwargs.setdefault("num_prompt_tokens", 10) - elif update_type == RequestStatsUpdate.Type.PREFILLING: - kwargs.setdefault("num_computed_tokens", 10) - kwargs.setdefault("num_cached_tokens", 10) - elif update_type == RequestStatsUpdate.Type.DETOKENIZED: - kwargs.setdefault("num_new_tokens", 10) - elif update_type == RequestStatsUpdate.Type.FINISHED: - kwargs.setdefault("finish_reason", "test_reason") - - return RequestStatsUpdate( - request_id=request_id, - type=update_type, - monotonic_ts_s=monotonic_ts_s, - **kwargs, - ) - - -def test_invalid_request_update(): - request_id = "test_request" - update_specific_required_fields = { - RequestStatsUpdate.Type.INPUT_PROCESSED: [ - "sampling_params", - "num_prompt_tokens", - ], - RequestStatsUpdate.Type.PREFILLING: [ - "num_computed_tokens", - "num_cached_tokens", - 
], - RequestStatsUpdate.Type.DETOKENIZED: ["num_new_tokens"], - RequestStatsUpdate.Type.FINISHED: ["finish_reason"], - } - - # Missing a required field should raise an assertion error. - for update_type in RequestStatsUpdate.Type: - required_fields = update_specific_required_fields.get(update_type, []) - - # Try to miss one of the required fields. - kwargs = {field: object() for field in required_fields} - for field in required_fields: - copy_kwargs = kwargs.copy() - copy_kwargs.pop(field) - with pytest.raises(ValueError): - RequestStatsUpdate( - request_id=request_id, - type=update_type, - **copy_kwargs, - ) - - -def test_invalid_request_update_transition(): - # Test invalid transition type. - for src in RequestStatsUpdate.Type: - for dst in RequestStatsUpdate.Type: - if dst not in RequestStatsUpdate._VALID_TRANSITIONS[src]: - with pytest.raises(AssertionError): - RequestStatsUpdate.check_valid_update( - make_update( - update_type=dst, - request_id="test_request", - monotonic_ts_s=1, - ), - last_update_type=src, - last_updated_ts_s=0, - ) - else: - RequestStatsUpdate.check_valid_update( - make_update( - request_id="test_request", - update_type=dst, - monotonic_ts_s=1, - ), - last_update_type=src, - last_updated_ts_s=0, - ) - - # Test invalid timestamp. - with pytest.raises(AssertionError): - RequestStatsUpdate.check_valid_update( - make_update( - request_id="test_request", - update_type=RequestStatsUpdate.Type.ARRIVED, - monotonic_ts_s=1, - ), - last_update_type=None, - last_updated_ts_s=2, - ) - - -def test_lifecycle_updates(): - request_id = "test_request" - stats = RequestStats(request_id=request_id) - - # Test the below scenario: - arrived_ts = 0 - input_processed_ts = 1 - queued_ts = 2 - prefilling_ts = 3 - decoded_ts = 5 - detokenized_ts = 6 - decoded_2_ts = 7 - detokenized_2_ts = 8 - preempted_ts = 9 - resumed_ts = 10 - decoded_3_ts = 11 - detokenized_3_ts = 12 - finished_ts = 13 - - # Test ARRIVED - arrived_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.ARRIVED, - monotonic_ts_s=arrived_ts, - ) - stats.update_from(arrived_update) - assert stats.arrival_ts_s == arrived_ts - assert stats.last_updated_ts_s == arrived_ts - - # Test INPUT_PROCESSED - sampling_params = SamplingParams(n=1) - input_processed_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.INPUT_PROCESSED, - monotonic_ts_s=input_processed_ts, - sampling_params=sampling_params, - num_prompt_tokens=6, - ) - stats.update_from(input_processed_update) - assert stats.input_processor_end_ts_s == input_processed_ts - assert stats.last_updated_ts_s == input_processed_ts - assert stats.num_prompt_tokens == 6 - assert stats.sampling_params == sampling_params - - assert stats.first_token_ts_s is None - assert stats.prefill_ts_s is None - - # Test QUEUED - queued_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.QUEUED, - monotonic_ts_s=queued_ts, - ) - stats.update_from(queued_update) - assert stats.queued_ts_s == queued_ts - assert stats.last_updated_ts_s == queued_ts - - # Test PREFILLING - prefilling_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.PREFILLING, - monotonic_ts_s=prefilling_ts, - num_computed_tokens=3, - num_cached_tokens=1, - ) - stats.update_from(prefilling_update) - assert stats.prefill_ts_s == prefilling_ts - assert stats.num_computed_tokens == 3 - assert stats.num_cached_tokens == 1 - assert stats.queue_duration_s == prefilling_ts - queued_ts - - # Test DECODING - 
decoded_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DECODING, - monotonic_ts_s=decoded_ts, - ) - stats.update_from(decoded_update) - assert stats.last_updated_ts_s == decoded_ts - - # Test DETOKENIZED - detokenized_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DETOKENIZED, - monotonic_ts_s=detokenized_ts, - num_new_tokens=1, - ) - stats.update_from(detokenized_update) - assert stats.last_updated_ts_s == detokenized_ts - assert stats.num_output_tokens == 1 - # Since arrival - assert stats.first_token_latency_s == detokenized_ts - arrived_ts - # Since first scheduled - assert stats.prefill_latency_s == detokenized_ts - prefilling_ts - - # Test another DECODING and DETOKENIZED should - # yield correct inter token latency - decoded_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DECODING, - monotonic_ts_s=decoded_2_ts, - ) - stats.update_from(decoded_update) - - detokenized_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DETOKENIZED, - monotonic_ts_s=detokenized_2_ts, - num_new_tokens=1, - ) - stats.update_from(detokenized_update) - assert stats.output_token_latency_s_lst == [ - detokenized_2_ts - detokenized_ts, - ] - assert stats.num_output_tokens == 2 - - # Test PREEMPTED - preempted_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.PREEMPTED, - monotonic_ts_s=preempted_ts, - ) - stats.update_from(preempted_update) - assert stats.last_updated_ts_s == preempted_ts - assert stats.preempted_ts_s_lst == [preempted_ts] - # States should be reset - assert stats.num_computed_tokens == 0 - assert stats.num_cached_tokens == 0 - # These states should not be reset - assert stats.num_output_tokens == 2 - assert stats.output_token_latency_s_lst == [ - detokenized_2_ts - detokenized_ts, - ] - assert stats.prefill_latency_s == prefilling_ts - arrived_ts - assert stats.num_prompt_tokens == 6 - assert stats.prefill_start_ts_s_lst == [prefilling_ts] - - # Test resumed - resumed_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.PREFILLING, - monotonic_ts_s=resumed_ts, - num_computed_tokens=6, - num_cached_tokens=2, - ) - stats.update_from(resumed_update) - # prefill timestamp should not be updated since it's a resumed prefill - assert stats.prefill_ts_s == prefilling_ts - assert stats.num_computed_tokens == 6 - assert stats.num_cached_tokens == 2 - assert stats.prefill_start_ts_s_lst == [ - prefilling_ts, - resumed_ts, - ] - assert stats.last_updated_ts_s == resumed_ts - - # Test another DECODED/DETOKENIZED should yield correct first token latency. 
- decoded_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DECODING, - monotonic_ts_s=decoded_3_ts, - ) - detokenized_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.DETOKENIZED, - monotonic_ts_s=detokenized_3_ts, - num_new_tokens=1, - ) - stats.update_from(decoded_update) - stats.update_from(detokenized_update) - assert stats.first_token_ts_s == detokenized_ts - arrived_ts - assert stats.num_output_tokens == 3 - assert stats.output_token_latency_s_lst == [ - detokenized_2_ts - detokenized_ts, - detokenized_3_ts - detokenized_2_ts, - ] - - # Test FINISHED - finished_update = RequestStatsUpdate( - request_id=request_id, - type=RequestStatsUpdate.Type.FINISHED, - monotonic_ts_s=finished_ts, - finish_reason="test_reason", - ) - stats.update_from(finished_update) - assert stats.last_updated_ts_s == finished_ts - assert stats.e2e_latency_s == finished_ts - arrived_ts - assert stats.inference_latency_s == finished_ts - prefilling_ts - assert stats.prefill_latency_s == detokenized_ts - prefilling_ts - assert stats.decode_latency_s == finished_ts - detokenized_ts - assert stats.first_token_latency_s == detokenized_ts - arrived_ts - assert stats.queue_duration_s == prefilling_ts - queued_ts - assert stats.is_finished - assert stats.finish_reason == "test_reason" - - # TODO(rickyx): Add model forward/execute time. - assert stats.model_forward_duration_s == 0.0 - assert stats.model_execute_duration_s == 0.0 diff --git a/vllm/v1/stats/__init__.py b/vllm/v1/stats/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/vllm/v1/stats/common.py b/vllm/v1/stats/common.py deleted file mode 100644 index 46818977dae5..000000000000 --- a/vllm/v1/stats/common.py +++ /dev/null @@ -1,453 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import time -from dataclasses import dataclass -from dataclasses import field as dataclass_field -from enum import IntEnum -from typing import ClassVar, Optional - -import msgspec -from msgspec import field as msgspec_field - -from vllm.sampling_params import SamplingParams - - -class RequestStatsUpdate( - msgspec.Struct, # type: ignore - array_like=True, - omit_defaults=True, - gc=False): - """ - An update to the request stats. - - This represents a stats update at a specific timestamp with metadata - associated with the update. - - NOTE: since there might be multiple processes generating updates at - different parts of the engine (e.g. input processor, scheduler, engine core, - etc.), we use the monotonic timestamp to record the update to compute any - intervals, and explicit wall-clock timestamp should be used for timestamps. - - WARNING: This assumes stats are generated in a single machine. If there are - potentially multiple machines, one should always generate the stats updates - on one single machine or use something else. - """ - - class Type(IntEnum): - """See `RequestStats` for the lifecycle of a request.""" - - # Request arrived at the engine frontend. - ARRIVED = 0 - # Input processed by the input processor. - INPUT_PROCESSED = 1 - # Queued on the engine core. - QUEUED = 2 - # Scheduled running prefill by the scheduler. - # A request could be running a new prefill on the prompt tokens or - # a resumed prefill on the original prefill tokens + generated output - # tokens before preemption. - PREFILLING = 3 - # Preempted by the scheduler. - PREEMPTED = 4 - # Output token is generated by the engine core. - DECODING = 5 - # Token detokenized by the detokenizer. 
- # We will record the timestamp for each output token, as well as the - # finish reason. - DETOKENIZED = 6 - # Request finishes (or aborts). - FINISHED = 7 - - """ - Valid state updates: - ARRIVED - │ - ├──────► INPUT_PROCESSED ──────► QUEUED ──────► PREFILLING ◄────┐ - │ │ │ │ │ - │ │ │ ▼ │ - │ │ │ -──► DECODING │ - │ │ │ | │ │ - │ │ │ | ▼ │ - │ │ │ └─ DETOKENIZED │ - │ │ │ │ │ - │ │ │ ▼ │ - │ ▼ ▼ PREEMPTED ◄──────┘ - │ │ │ │ - └──────────────┴───────────────────┴──────────────┴ - │ - ▼ - FINISHED (All could go to FINISHED) - """ - _VALID_TRANSITIONS: ClassVar[dict[Type, set[Type]]] = { - Type.ARRIVED: { - Type.INPUT_PROCESSED, - Type.FINISHED, - }, - Type.INPUT_PROCESSED: { - Type.QUEUED, - Type.FINISHED, - }, - Type.QUEUED: { - Type.PREFILLING, - Type.FINISHED, - }, - Type.PREFILLING: { - Type.DECODING, - Type.PREEMPTED, - Type.FINISHED, - }, - Type.DECODING: { - Type.DETOKENIZED, - Type.FINISHED, - }, - Type.DETOKENIZED: { - Type.DECODING, - Type.PREEMPTED, - Type.FINISHED, - }, - Type.PREEMPTED: {Type.PREFILLING, Type.FINISHED}, - Type.FINISHED: set(), - } - - request_id: str - - type: Type - - # Timestamp when the update is recorded. This is used to record time - # intervals between events rather than wall clock time. - monotonic_ts_s: float = msgspec_field( - default_factory=lambda: time.monotonic()) - - ############################################################ - # Metadata associated with the update. - ############################################################ - # For input_processed. Metadata needed for stats logging. - num_prompt_tokens: Optional[int] = None - sampling_params: Optional[SamplingParams] = None - - # For running. - # Number of tokens computed when scheduled to run. - num_computed_tokens: Optional[int] = None - # Number of cached tokens when scheduled to run. - num_cached_tokens: Optional[int] = None - - # For decoded. - # The number of new output tokens generated. - num_new_tokens: Optional[int] = None - - # For both detokenized and decoded. - # Finished reason. - finish_reason: Optional[str] = None - - # Non-optional fields for each update type. 
- _REQUIRED_FIELDS: ClassVar[dict[Type, list[str]]] = { - Type.INPUT_PROCESSED: ["num_prompt_tokens", "sampling_params"], - Type.PREFILLING: ["num_computed_tokens", "num_cached_tokens"], - Type.DETOKENIZED: ["num_new_tokens"], - Type.FINISHED: ["finish_reason"], - } - - def __post_init__(self): - required_fields = self._REQUIRED_FIELDS.get(self.type, []) - for field in required_fields: - if getattr(self, field) is None: - raise ValueError( - f"Field {field} is required for update type {self.type}.") - - @staticmethod - def check_valid_update( - update: "RequestStatsUpdate", - last_update_type: Optional[Type], - last_updated_ts_s: Optional[float], - ): - if last_update_type is None: - assert update.type == RequestStatsUpdate.Type.ARRIVED - else: - valid_cur_update_types = RequestStatsUpdate._VALID_TRANSITIONS[ - last_update_type] - assert update.type in valid_cur_update_types, ( - f"Invalid update type: {update.type} for last_update_type: " - f"{last_update_type}.") - - if last_updated_ts_s is not None: - assert update.monotonic_ts_s >= last_updated_ts_s, ( - "Update timestamp must be monotonically increasing, but " - f"last_updated_ts_s={last_updated_ts_s} and " - f"update.monotonic_ts_s={update.monotonic_ts_s}.") - - -@dataclass -class RequestStats: - """Stats associated with a request (`Request`).""" - - ############################################################ - # Metadata - ############################################################ - request_id: str - sampling_params: Optional[SamplingParams] = None - num_prompt_tokens: Optional[int] = None - - ############################################################ - # Metrics and Stats - ############################################################ - # Timestamp when the request was last updated. - last_updated_ts_s: Optional[float] = None - - # Last update stats type. - last_update_type: Optional[RequestStatsUpdate.Type] = None - - # Timestamp when the request arrived at the llm engine. - arrival_ts_s: Optional[float] = None - - # Number of tokens cached. When part of the request prefix is cached, - # this will be set. - num_cached_tokens: int = 0 - - # Number of tokens computed. - num_computed_tokens: int = 0 - - # The timestamp when the request become waiting in the queue. - queued_ts_s: Optional[float] = None - - # When the input processor is completed. - input_processor_end_ts_s: Optional[float] = None - - # A sorted list of timestamps when the request was scheduled to prefill. - # This could be when: - # 1. the request is newly scheduled, so it's a new prefill. - # 2. the request was preempted and resumed. It is equivalent to running - # a prefill of the original prefill tokens + generated output tokens - # before preemption. - prefill_start_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # A list of timestamps when a token is decoded by the engine core. - decoding_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # A sorted list of timestamps for each output token. - output_token_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # First token's timestamp. - first_token_ts_s: Optional[float] = None - - # TODO(rickyx): we need model runner to surface these. - model_forward_duration_s: float = 0.0 - # Includes model forward, block/sync across workers, cpu-gpu sync time - # and sampling time. - model_execute_duration_s: float = 0.0 - - # A sorted list of timestamps when the request was preempted at the - # scheduler. 
- # TODO(rickyx): right now, we don't actually have a good high-level - # metric to measure the impact of preemption other than observation of - # large P99 TPOT. Ideally we could quantify the impact of preemption by - # measuring the number of tokens re-computed due to preemption. - preempted_ts_s_lst: list[float] = dataclass_field(default_factory=list) - - # Timestamp when the request was finished at the engine core. - finished_ts_s: Optional[float] = None - - # Finish reason. - finish_reason: Optional[str] = None - - ############################################################ - # Derived properties. - ############################################################ - @property - def prefill_ts_s(self) -> Optional[float]: - """The timestamp when the request started prefilling. - Since a request could be preempted in decoding and later resumed - to prefill the decoded tokens, we use the first prefill start timestamp. - """ - return (self.prefill_start_ts_s_lst[0] - if self.prefill_start_ts_s_lst else None) - - @property - def e2e_latency_s(self) -> Optional[float]: - if self.finished_ts_s is None or self.arrival_ts_s is None: - return None - assert self.finished_ts_s >= self.arrival_ts_s - return self.finished_ts_s - self.arrival_ts_s - - @property - def queue_duration_s(self) -> Optional[float]: - """How long the request was waiting to run.""" - if self.queued_ts_s is None or self.prefill_ts_s is None: - # Either not queued or not running yet. - return None - assert self.queued_ts_s <= self.prefill_ts_s - return self.prefill_ts_s - self.queued_ts_s - - @property - def inference_latency_s(self) -> Optional[float]: - """How long the request was running inference - (prefill and decode).""" - if self.finished_ts_s is None or self.prefill_ts_s is None: - return None - assert self.finished_ts_s >= self.prefill_ts_s - return self.finished_ts_s - self.prefill_ts_s - - @property - def first_token_latency_s(self) -> Optional[float]: - if self.first_token_ts_s is None or self.arrival_ts_s is None: - return None - assert self.first_token_ts_s >= self.arrival_ts_s - return self.first_token_ts_s - self.arrival_ts_s - - @property - def prefill_latency_s(self) -> Optional[float]: - if self.first_token_ts_s is None or self.prefill_ts_s is None: - return None - assert self.first_token_ts_s >= self.prefill_ts_s - return self.first_token_ts_s - self.prefill_ts_s - - @property - def decode_latency_s(self) -> Optional[float]: - if self.e2e_latency_s is None or self.first_token_latency_s is None: - return None - assert self.e2e_latency_s >= self.first_token_latency_s - return self.e2e_latency_s - self.first_token_latency_s - - @property - def output_token_latency_s_lst(self) -> list[float]: - if len(self.output_token_ts_s_lst) == 0: - return [] - latency_s_lst = [] - for i in range(1, len(self.output_token_ts_s_lst)): - assert (self.output_token_ts_s_lst[i] - >= self.output_token_ts_s_lst[i - 1]) - latency_s = (self.output_token_ts_s_lst[i] - - self.output_token_ts_s_lst[i - 1]) - latency_s_lst.append(latency_s) - return latency_s_lst - - @property - def num_output_tokens(self) -> int: - return len(self.output_token_ts_s_lst) - - @property - def is_finished(self) -> bool: - return self.finished_ts_s is not None - - def update_from(self, update: "RequestStatsUpdate"): - RequestStatsUpdate.check_valid_update(update, self.last_update_type, - self.last_updated_ts_s) - ts = update.monotonic_ts_s - self.last_updated_ts_s = ts - self.last_update_type = update.type - if update.type == RequestStatsUpdate.Type.ARRIVED: - 
self.arrival_ts_s = ts - elif update.type == RequestStatsUpdate.Type.INPUT_PROCESSED: - self.input_processor_end_ts_s = ts - self.sampling_params = update.sampling_params - self.num_prompt_tokens = update.num_prompt_tokens - elif update.type == RequestStatsUpdate.Type.QUEUED: - self.queued_ts_s = ts - elif update.type == RequestStatsUpdate.Type.PREFILLING: - self.prefill_start_ts_s_lst.append(ts) - self.num_cached_tokens = update.num_cached_tokens or 0 - self.num_computed_tokens = update.num_computed_tokens or 0 - elif update.type == RequestStatsUpdate.Type.PREEMPTED: - self._reset_for_preemption(ts) - elif update.type == RequestStatsUpdate.Type.DECODING: - self.decoding_ts_s_lst.append(ts) - elif update.type == RequestStatsUpdate.Type.DETOKENIZED: - self._record_detokenized_output( - ts, - update.num_new_tokens or 0, - ) - elif update.type == RequestStatsUpdate.Type.FINISHED: - self.finished_ts_s = ts - self.finish_reason = update.finish_reason - else: - raise ValueError(f"Unknown update type: {update.type}") - - def _record_detokenized_output( - self, - ts_s: float, - num_new_tokens: int, - ): - # Update if first output token is generated. - if len(self.output_token_ts_s_lst) == 0: - self.first_token_ts_s = ts_s - assert ( - self.prefill_ts_s is not None - ), "Request must be running before generating output tokens." - - # Some X new tokens were generated at the ts. - self.output_token_ts_s_lst.extend([ts_s] * num_new_tokens) - - def _reset_for_preemption(self, ts_s: float): - self.preempted_ts_s_lst.append(ts_s) - # Reset the computed tokens since it might restart the prefill. - self.num_computed_tokens = 0 - # Cached token count might also change when resumed. - self.num_cached_tokens = 0 - # These stats don't change since they happen before request running. - # - arrival_ts_s - # - input_processor_end_ts_s - # - sampling_params - # - num_prompt_tokens - # - first_token_ts_s - # - # These stats are accumulated over preemptions: - # - output_token_ts_s_lst - # - prefill_start_ts_s_lst (after preemption, it will prefill the - # original prefill tokens and any output tokens generated before - # preemption.) - - -@dataclass -class KVCacheStats: - # KV Cache Usage in % - gpu_cache_usage_sys: float = 0.0 - gpu_prefix_cache_hit_rate: float = 0.0 - - -@dataclass -class SchedulerStats: - """Stats associated with the scheduler.""" - - # Number of requests currently running. - num_running_reqs: int = 0 - # Number of requests currently waiting. - num_waiting_reqs: int = 0 - - kv_cache_stats: KVCacheStats = dataclass_field( - default_factory=KVCacheStats) - - -@dataclass -class EngineCoreProcessStats: - """Stats associated with the engine core process.""" - - # Number of requests currently in the input queue. None if the engine core - # is not running in multiprocess mode. - input_queue_size: Optional[int] = None - # Number of outputs currently in the output queue. None if the engine core - # is not running in multiprocess mode. - output_queue_size: Optional[int] = None - - -class EngineCoreStatsSnapshot( - msgspec.Struct, # type: ignore - array_like=True, - omit_defaults=True, - gc=False): - """ - A snapshot of the EngineCore's current stats over a period of time. - """ - - # Snapshot of the scheduler stats. - scheduler_stats: SchedulerStats = msgspec_field( - default_factory=SchedulerStats) - - # Per request stats updates. - requests_stats_updates: list[RequestStatsUpdate] = msgspec_field( - default_factory=list) - - # Engine core's queue stats. 
-    engine_core_process_stats: EngineCoreProcessStats = msgspec_field(
-        default_factory=EngineCoreProcessStats)
-
-    # TODO(rickyx): Add other components' stats,
-    # e.g. model runner/worker and etc.

From afe3236e90d6eb2f3aa608b3453d82fc42a02c38 Mon Sep 17 00:00:00 2001
From: Aaron Pham
Date: Thu, 15 May 2025 01:00:43 -0400
Subject: [PATCH 18/58] [Chore] astral's ty (#18116)

Signed-off-by: Aaron Pham
---
 docs/source/getting_started/quickstart.md | 4 ++--
 pyproject.toml | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/docs/source/getting_started/quickstart.md b/docs/source/getting_started/quickstart.md
index 25189b006c26..298ba59f7d8b 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/source/getting_started/quickstart.md
@@ -19,8 +19,8 @@ If you are using NVIDIA GPUs, you can install vLLM using [pip](https://pypi.org/
 It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment and install vLLM using the following commands:

 ```console
-uv venv myenv --python 3.12 --seed
-source myenv/bin/activate
+uv venv --python 3.12 --seed
+source .venv/bin/activate
 uv pip install vllm
 ```

diff --git a/pyproject.toml b/pyproject.toml
index 46cf7a801fd6..c3d0440f32b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -170,3 +170,9 @@ plugins.md013.enabled = false # line-length
 plugins.md041.enabled = false # first-line-h1
 plugins.md033.enabled = false # inline-html
 plugins.md024.allow_different_nesting = true # no-duplicate-headers
+
+[tool.ty]
+respect-ignore-files = true
+
+[tool.ty.environment]
+python = "./.venv"

From 2dff093574427b73f360342f2cf34af5328950a5 Mon Sep 17 00:00:00 2001
From: Reid <61492567+reidliu41@users.noreply.github.com>
Date: Thu, 15 May 2025 13:02:23 +0800
Subject: [PATCH 19/58] [Misc] add lobe-chat support (#18177)

Signed-off-by: reidliu41
Co-authored-by: reidliu41
---
 docs/source/deployment/frameworks/index.md | 1 +
 docs/source/deployment/frameworks/lobe-chat.md | 13 +++++++++++++
 2 files changed, 14 insertions(+)
 create mode 100644 docs/source/deployment/frameworks/lobe-chat.md

diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
index 6708f2c4135f..9744f5f4d362 100644
--- a/docs/source/deployment/frameworks/index.md
+++ b/docs/source/deployment/frameworks/index.md
@@ -10,6 +10,7 @@ chatbox
 dify
 dstack
 helm
+lobe-chat
 lws
 modal
 open-webui
diff --git a/docs/source/deployment/frameworks/lobe-chat.md b/docs/source/deployment/frameworks/lobe-chat.md
new file mode 100644
index 000000000000..6d86b7fa9cce
--- /dev/null
+++ b/docs/source/deployment/frameworks/lobe-chat.md
@@ -0,0 +1,13 @@
+(deployment-lobe-chat)=
+
+# Lobe Chat
+
+[Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLM UI and framework.
+
+It supports speech synthesis, multi-modal interaction, and an extensible (function call) plugin system.
+
+It offers one-click, free deployment of your private OpenAI ChatGPT/Claude/Gemini/Groq/Ollama chat application.
+
+It supports vLLM as an AI model provider to efficiently serve large language models.
+
+For details, see the tutorial [Using vLLM in LobeChat](https://lobehub.com/docs/usage/providers/vllm).
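LobeChat consumes vLLM through the OpenAI-compatible API exposed by `vllm serve`, so the integration above is purely a documentation addition on the vLLM side. A minimal sketch of the serving half (the model name and port are illustrative assumptions, not values taken from this patch; LobeChat's own provider settings are covered in the tutorial linked above):

```console
# Start vLLM's OpenAI-compatible server (model name is an example)
vllm serve meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 8000

# Quick smoke test of the endpoint an OpenAI-compatible client will use
curl http://localhost:8000/v1/models
```

Any OpenAI-compatible client, LobeChat included, can then be pointed at `http://<host>:8000/v1`.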
From 83f74c698f1f7c781ae02e3c533a52432799e717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Thu, 15 May 2025 01:04:43 -0400 Subject: [PATCH 20/58] [Fix][ROCm] Enforce eager for all encoder-decoder models on ROCm (#18154) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Luka Govedič --- vllm/config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 09e89c1116f1..81cac4d04116 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -906,12 +906,17 @@ def _verify_quantization(self) -> None: def _verify_cuda_graph(self) -> None: self.max_seq_len_to_capture = min(self.max_seq_len_to_capture, self.max_model_len) + # CUDAGraph capture not supported for enc-dec models and mllama on ROCm ROCM_UNSUPPORTED_MODELS = ['mllama'] - if (self.hf_config.model_type in ROCM_UNSUPPORTED_MODELS - and not self.enforce_eager and current_platform.is_rocm()): + unsupported_rocm = (self.hf_config.model_type + in ROCM_UNSUPPORTED_MODELS + or self.is_encoder_decoder) + + if (unsupported_rocm and not self.enforce_eager + and current_platform.is_rocm()): logger.warning( "CUDA graph is not supported for %s on ROCm yet, fallback " - "to the eager mode.", self.hf_config.model_type) + "to eager mode.", self.hf_config.model_type) self.enforce_eager = True def _verify_bnb_config(self) -> None: From 26d041930978aa0d06d565d3e1f1e41686ad0c90 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 15 May 2025 06:06:50 +0100 Subject: [PATCH 21/58] Update deprecated type hinting in `models` (#18132) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- pyproject.toml | 1 - vllm/model_executor/models/arctic.py | 13 +-- vllm/model_executor/models/aria.py | 20 ++--- vllm/model_executor/models/aya_vision.py | 8 +- vllm/model_executor/models/baichuan.py | 15 ++-- vllm/model_executor/models/bamba.py | 15 ++-- vllm/model_executor/models/bart.py | 7 +- vllm/model_executor/models/bert.py | 13 +-- vllm/model_executor/models/bert_with_rope.py | 25 +++--- vllm/model_executor/models/blip.py | 9 +- vllm/model_executor/models/blip2.py | 8 +- vllm/model_executor/models/bloom.py | 9 +- vllm/model_executor/models/chameleon.py | 20 ++--- vllm/model_executor/models/chatglm.py | 11 +-- vllm/model_executor/models/clip.py | 9 +- vllm/model_executor/models/commandr.py | 11 +-- .../models/constant_size_cache.py | 12 +-- vllm/model_executor/models/dbrx.py | 9 +- vllm/model_executor/models/deepseek.py | 15 ++-- vllm/model_executor/models/deepseek_mtp.py | 9 +- vllm/model_executor/models/deepseek_v2.py | 13 +-- vllm/model_executor/models/deepseek_vl2.py | 18 ++-- vllm/model_executor/models/eagle.py | 5 +- vllm/model_executor/models/exaone.py | 19 ++-- vllm/model_executor/models/fairseq2_llama.py | 8 +- vllm/model_executor/models/falcon.py | 13 +-- vllm/model_executor/models/florence2.py | 24 ++--- vllm/model_executor/models/fuyu.py | 6 +- vllm/model_executor/models/gemma.py | 15 ++-- vllm/model_executor/models/gemma2.py | 15 ++-- vllm/model_executor/models/gemma3.py | 15 ++-- vllm/model_executor/models/gemma3_mm.py | 6 +- vllm/model_executor/models/glm4.py | 11 +-- vllm/model_executor/models/gpt2.py | 9 +- vllm/model_executor/models/gpt_bigcode.py | 13 +-- vllm/model_executor/models/gpt_j.py | 13 +-- vllm/model_executor/models/gpt_neox.py | 13 +-- vllm/model_executor/models/granite.py | 17 ++-- vllm/model_executor/models/granite_speech.py | 7 +- 
vllm/model_executor/models/granitemoe.py | 11 +-- .../model_executor/models/granitemoehybrid.py | 15 ++-- .../model_executor/models/granitemoeshared.py | 11 +-- vllm/model_executor/models/grok1.py | 19 ++-- .../models/idefics2_vision_model.py | 9 +- vllm/model_executor/models/idefics3.py | 8 +- vllm/model_executor/models/interfaces.py | 88 +++++++++---------- vllm/model_executor/models/interfaces_base.py | 26 +++--- vllm/model_executor/models/intern_vit.py | 9 +- vllm/model_executor/models/internlm2.py | 19 ++-- vllm/model_executor/models/internlm2_ve.py | 4 +- vllm/model_executor/models/internvl.py | 6 +- vllm/model_executor/models/jais.py | 9 +- vllm/model_executor/models/jamba.py | 13 +-- vllm/model_executor/models/kimi_vl.py | 9 +- vllm/model_executor/models/llama.py | 19 ++-- vllm/model_executor/models/llama4.py | 27 +++--- vllm/model_executor/models/llama_eagle.py | 10 +-- vllm/model_executor/models/llama_eagle3.py | 13 +-- vllm/model_executor/models/llava.py | 8 +- vllm/model_executor/models/llava_next.py | 15 ++-- .../model_executor/models/llava_next_video.py | 14 +-- vllm/model_executor/models/llava_onevision.py | 21 +++-- vllm/model_executor/models/mamba.py | 17 ++-- vllm/model_executor/models/mamba2.py | 13 +-- vllm/model_executor/models/mamba_cache.py | 5 +- vllm/model_executor/models/medusa.py | 25 +++--- vllm/model_executor/models/mimo.py | 9 +- vllm/model_executor/models/mimo_mtp.py | 9 +- vllm/model_executor/models/minicpm.py | 17 ++-- vllm/model_executor/models/minicpm3.py | 4 +- vllm/model_executor/models/minicpmo.py | 7 +- vllm/model_executor/models/minicpmv.py | 13 ++- vllm/model_executor/models/minimax_text_01.py | 17 ++-- vllm/model_executor/models/minimax_vl_01.py | 6 +- vllm/model_executor/models/mistral3.py | 8 +- vllm/model_executor/models/mixtral.py | 13 +-- vllm/model_executor/models/mixtral_quant.py | 9 +- vllm/model_executor/models/mllama.py | 80 ++++++++--------- vllm/model_executor/models/mllama4.py | 18 ++-- vllm/model_executor/models/mlp_speculator.py | 10 +-- vllm/model_executor/models/modernbert.py | 11 +-- vllm/model_executor/models/module_mapping.py | 18 ++-- vllm/model_executor/models/molmo.py | 32 +++---- vllm/model_executor/models/moonvit.py | 7 +- vllm/model_executor/models/mpt.py | 13 +-- vllm/model_executor/models/nemotron.py | 15 ++-- vllm/model_executor/models/nemotron_nas.py | 17 ++-- vllm/model_executor/models/olmo.py | 11 +-- vllm/model_executor/models/olmo2.py | 7 +- vllm/model_executor/models/olmoe.py | 15 ++-- vllm/model_executor/models/opt.py | 13 +-- vllm/model_executor/models/orion.py | 17 ++-- vllm/model_executor/models/ovis.py | 10 +-- vllm/model_executor/models/paligemma.py | 6 +- vllm/model_executor/models/persimmon.py | 13 +-- vllm/model_executor/models/phi.py | 13 +-- vllm/model_executor/models/phi3_small.py | 17 ++-- vllm/model_executor/models/phi3v.py | 14 +-- vllm/model_executor/models/phi4mm.py | 12 +-- vllm/model_executor/models/phi4mm_audio.py | 4 +- vllm/model_executor/models/phi4mm_utils.py | 4 +- vllm/model_executor/models/phimoe.py | 13 +-- vllm/model_executor/models/pixtral.py | 26 +++--- vllm/model_executor/models/plamo2.py | 7 +- .../models/prithvi_geospatial_mae.py | 8 +- vllm/model_executor/models/qwen.py | 13 +-- vllm/model_executor/models/qwen2.py | 19 ++-- .../models/qwen2_5_omni_thinker.py | 20 ++--- vllm/model_executor/models/qwen2_5_vl.py | 20 ++--- vllm/model_executor/models/qwen2_audio.py | 6 +- vllm/model_executor/models/qwen2_moe.py | 17 ++-- vllm/model_executor/models/qwen2_rm.py | 7 +- 
vllm/model_executor/models/qwen2_vl.py | 17 ++-- vllm/model_executor/models/qwen3.py | 11 +-- vllm/model_executor/models/qwen3_moe.py | 15 ++-- vllm/model_executor/models/qwen_vl.py | 9 +- vllm/model_executor/models/registry.py | 62 ++++++------- vllm/model_executor/models/roberta.py | 11 +-- vllm/model_executor/models/siglip.py | 11 +-- vllm/model_executor/models/skyworkr1v.py | 6 +- vllm/model_executor/models/smolvlm.py | 4 +- vllm/model_executor/models/solar.py | 13 +-- vllm/model_executor/models/stablelm.py | 15 ++-- vllm/model_executor/models/starcoder2.py | 13 +-- vllm/model_executor/models/telechat2.py | 12 +-- vllm/model_executor/models/transformers.py | 3 +- vllm/model_executor/models/ultravox.py | 6 +- vllm/model_executor/models/utils.py | 50 +++++------ vllm/model_executor/models/whisper.py | 22 ++--- vllm/model_executor/models/zamba2.py | 25 +++--- 130 files changed, 971 insertions(+), 901 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c3d0440f32b5..9465f1e8f059 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,6 @@ exclude = [ "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] "vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"] -"vllm/model_executor/models/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] "vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index c518efdb54f8..94a4328564bb 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Snowflake Arctic model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -458,8 +459,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -467,8 +468,8 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] - mlp_params_mapping: List[Tuple[str, str, int]] = [] - expert_params_mapping: List[Tuple[str, str, int]] = [] + mlp_params_mapping: list[tuple[str, str, int]] = [] + expert_params_mapping: list[tuple[str, str, int]] = [] num_layers = self.config.num_hidden_layers for layer in range(num_layers): @@ -497,7 +498,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. 
" diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 7c716efab8ef..f74e13888c48 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import torch import torch.nn as nn @@ -66,8 +66,8 @@ def __init__( # Identity layer self.post_layernorm = nn.Identity() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -75,7 +75,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # NOTE: post_layernorm is not used in Aria @@ -326,8 +326,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # Adapted from LlamaModel.load_weights with the modification of adding # the expert weights mapping to `stacked_params_mapping` - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -339,7 +339,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("experts.w2_weight", "experts.fc2.weight", 'w2'), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -528,7 +528,7 @@ def __init__( self.vocab_size, logit_scale) def _validate_image_sizes( - self, images: List[torch.Tensor]) -> List[torch.Tensor]: + self, images: list[torch.Tensor]) -> list[torch.Tensor]: if not all(img.shape == images[0].shape for img in images): raise ValueError("All images must be the same size") return images @@ -578,7 +578,7 @@ def _create_patch_attention_mask( def _process_image_input( self, image_input: AriaImagePixelInputs - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: assert self.vision_tower is not None pixel_values = image_input['pixel_values'] @@ -651,6 +651,6 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/aya_vision.py b/vllm/model_executor/models/aya_vision.py index d152287e8fa3..08d49d71eca1 100644 --- a/vllm/model_executor/models/aya_vision.py +++ b/vllm/model_executor/models/aya_vision.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 Adapted from # https://github.com/huggingface/transformers/tree/main/src/transformers/models/aya_vision -from typing import (Iterable, Literal, Mapping, Optional, Sequence, Set, Tuple, - TypedDict, Union, cast) +from collections.abc import Iterable, Mapping, Sequence +from typing import Literal, Optional, TypedDict, Union, cast import torch from torch import nn @@ -315,8 +315,8 @@ def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): def dtype(self): return next(self.parameters()).dtype - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 444ed38d05c0..077e36176430 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -20,7 +20,8 @@ # limitations under the License. """Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -230,7 +231,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -320,15 +321,15 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -421,8 +422,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bamba.py b/vllm/model_executor/models/bamba.py index 87e1e102efd8..d6a705fb1859 100644 --- a/vllm/model_executor/models/bamba.py +++ b/vllm/model_executor/models/bamba.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Bamba model.""" # Added by the IBM Team, 2024 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -355,8 +356,8 @@ def forward( hidden_states, _ = self.final_layernorm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -367,7 +368,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -495,7 +496,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ 
-535,7 +536,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index bcfbe92c3a11..92bbe1bb67a3 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -19,7 +19,8 @@ # limitations under the License. """PyTorch BART model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -859,14 +860,14 @@ def _rename_key(self, key: str): def _rename_stacked_param( self, name: str, - ) -> Tuple[str, Optional[str]]: + ) -> tuple[str, Optional[str]]: for key, mapping in self.stacked_params_mapping.items(): if key in name: name = name.replace(key, mapping["param_name"]) return name, mapping["shard_id"] return name, None - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): model_params_dict = dict(self.model.named_parameters()) top_params_dict = dict(self.named_parameters()) diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 111b49ab8dd2..0c6593bbe3a1 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -349,8 +350,8 @@ def forward( token_type_ids=token_type_ids) return self.encoder(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -359,7 +360,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if self.pooler is None and "pooler" in name: continue @@ -424,7 +425,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) @@ -472,7 +473,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pooler = CrossEncodingPooler(config, self.classifier, self.bert.pooler) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/bert_with_rope.py b/vllm/model_executor/models/bert_with_rope.py index 002949abff52..af6deb3bf072 100644 --- a/vllm/model_executor/models/bert_with_rope.py +++ b/vllm/model_executor/models/bert_with_rope.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn 
@@ -208,7 +209,7 @@ def __init__(self, hidden_size: int, moe_num_experts: int, moe_top_k: int): def forward( self, x: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.LongTensor]: weights = self.layer(x.view(-1, x.shape[-1]))[0].softmax( dim=-1, dtype=torch.float32) top_weights, top_experts = torch.topk(weights, self.moe_top_k, dim=-1) @@ -428,8 +429,8 @@ def forward( token_type_ids=token_type_ids) return self.encoder(positions, hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) if self.config.hidden_act in ["silu", "geglu"]: @@ -442,7 +443,7 @@ def load_weights(self, weights: Iterable[Tuple[str, stacked_params_mapping = [] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "pooler" in name: continue @@ -567,7 +568,7 @@ def config_verify(self, vllm_config): } return config - def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def split_up_gate_proj(self, weights: Iterable[tuple[str, torch.Tensor]]): n = "mlp.up_gate_proj" for name, weight in weights: if n in name: @@ -578,14 +579,14 @@ def split_up_gate_proj(self, weights: Iterable[Tuple[str, torch.Tensor]]): yield name, weight def ignore_unnecessary_layers(self, - weights: Iterable[Tuple[str, torch.Tensor]]): + weights: Iterable[tuple[str, torch.Tensor]]): for name, weight in weights: if name.startswith("classifier"): continue yield name, weight - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.ignore_unnecessary_layers(weights) weights = self.split_up_gate_proj(weights) return super().load_weights(weights) @@ -664,7 +665,7 @@ def forward( token_type_ids=token_type_ids) @torch.inference_mode() - def jina_merge_lora_weights(self, weights: Iterable[Tuple[str, + def jina_merge_lora_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # use for jina-embeddings-v3 # Merge Lora weights into a single weight tensor. 
@@ -707,7 +708,7 @@ def jina_merge_lora_weights(self, weights: Iterable[Tuple[str, return [(name, weight) for name, weight in weights.items()] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.jina_merge_lora_weights(weights) return super().load_weights(weights) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f3d488926d09..acbc5d04d7e3 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -296,8 +297,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -305,7 +306,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index f44565bd2e01..2ff7e394a416 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -186,7 +186,7 @@ def forward( self, hidden_states: torch.Tensor, encoder_hidden_states: Optional[torch.FloatTensor] = None, - ) -> Tuple[torch.Tensor]: + ) -> tuple[torch.Tensor]: self_output = self.attention( hidden_states, encoder_hidden_states=encoder_hidden_states, @@ -712,7 +712,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 74d401b295ce..eb1085d6b40d 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -18,7 +18,8 @@ # limitations under the License. 
"""Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -322,10 +323,10 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index ef8b033f3846..a4528ca26d01 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -2,7 +2,7 @@ from collections.abc import Iterable, Mapping, Sequence from functools import cached_property -from typing import Any, Dict, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -229,7 +229,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -292,7 +292,7 @@ def __init__( prefix=f"{prefix}.attn") def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: # reshape for layernorm q = q.reshape(-1, self.num_heads, self.head_dim) k = k.reshape(-1, self.num_kv_heads, self.head_dim) @@ -367,7 +367,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: if residual is None: residual = hidden_states @@ -438,7 +438,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states hidden_states = self.self_attn( @@ -773,7 +773,7 @@ def __init__(self, config: ChameleonVQVAEConfig): def encode( self, pixel_values: torch.Tensor - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: hidden_states = self.encoder(pixel_values) hidden_states = self.quant_conv(hidden_states) quant, emb_loss, indices = self.quantize(hidden_states) @@ -786,7 +786,7 @@ class ChameleonImageVocabularyMapping: A class for mapping discrete image tokens from VQGAN to BPE tokens. 
""" - def __init__(self, vocab_map: Dict[str, int]): + def __init__(self, vocab_map: dict[str, int]): self.vocab_map = vocab_map self.image_token_id = vocab_map.get("") @@ -1052,8 +1052,8 @@ def compute_logits( return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1063,7 +1063,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 233e9ee0a258..4e95afe1a147 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -3,7 +3,8 @@ # https://github.com/THUDM/ChatGLM2-6B """Inference-only ChatGLM model compatible with THUDM weights.""" import json -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -358,15 +359,15 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("linear_proj.merged_proj", "linear_proj.gate_proj", 0), ("linear_proj.merged_proj", "linear_proj.dense_h_to_4h", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -440,7 +441,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 153054e5c028..e8f3ae2156e0 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -368,8 +369,8 @@ def device(self): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -377,7 +378,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git 
a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 8f64e5d5c966..546b5f932877 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -21,7 +21,8 @@ # This file is based on the LLama model definition file in transformers """PyTorch Cohere model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -259,7 +260,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states, residual = self.input_layernorm(hidden_states, residual) @@ -404,8 +405,8 @@ def compute_logits( return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -415,7 +416,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading rotary embeddings since vLLM has its own diff --git a/vllm/model_executor/models/constant_size_cache.py b/vllm/model_executor/models/constant_size_cache.py index d073a7de6917..f1cc7e0f9e29 100644 --- a/vllm/model_executor/models/constant_size_cache.py +++ b/vllm/model_executor/models/constant_size_cache.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod -from typing import Any, Dict, List, Tuple +from typing import Any import torch @@ -16,7 +16,7 @@ class ConstantSizeCache(ABC): def __init__(self, max_batch_size: int): # Maps between the request id and a dict that maps between the seq_id # and its index inside the cache - self.cache_indices_mapping: Dict[str, Dict[int, int]] = {} + self.cache_indices_mapping: dict[str, dict[int, int]] = {} self.free_cache_indices = list(range(max_batch_size)) @property @@ -30,7 +30,7 @@ def _copy_cache(self, from_index: int, to_index: int): """Copy cache data from one index to another""" pass - def current_run_tensors(self, **kwargs) -> Tuple: + def current_run_tensors(self, **kwargs) -> tuple: """ Return the tensors for the current run's conv and ssm state. 
""" @@ -117,8 +117,8 @@ def _assign_seq_id_to_cache_index(self, cur_rid: str, seq_id: int, return self.cache_indices_mapping[cur_rid][seq_id] def _prepare_current_run_cache( - self, request_ids_to_seq_ids: Dict[str, list[int]], - finished_requests_ids: List[str]) -> List[int]: + self, request_ids_to_seq_ids: dict[str, list[int]], + finished_requests_ids: list[str]) -> list[int]: return [ self._assign_seq_id_to_cache_index(req_id, seq_id, finished_requests_ids) @@ -127,7 +127,7 @@ def _prepare_current_run_cache( ] def _release_finished_requests(self, - finished_seq_groups_req_ids: List[str]): + finished_seq_groups_req_ids: list[str]): for req_id in finished_seq_groups_req_ids: if req_id in self.cache_indices_mapping: for seq_id in self.cache_indices_mapping[req_id]: diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 850fba2604e1..e0b4712cdb47 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -414,14 +415,14 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: expert_params_mapping = [( "w13" if weight_name in ["w1", "v1"] else "w2", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index c6421143dd68..88d1ca9f7b83 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Deepseek model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -184,7 +185,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -385,8 +386,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -397,7 +398,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -478,7 +479,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py index b50175cf764f..6d7b52aba5f9 100644 --- a/vllm/model_executor/models/deepseek_mtp.py +++ b/vllm/model_executor/models/deepseek_mtp.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -176,8 +177,8 @@ def compute_logits( return self.model.compute_logits(hidden_states, sampling_metadata, spec_step_idx) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), @@ -190,7 +191,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 680b7e614dd6..b78c193c1345 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only DeepseekV2/DeepseekV3 model.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -200,7 +201,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -352,7 +353,7 @@ def __init__( q_lora_rank: Optional[int], kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -736,8 +737,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -753,7 +754,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 6d8f27530cee..164fa40ffebe 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -4,7 +4,7 @@ """Inference-only Deepseek-VL2 model compatible with HuggingFace weights.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -45,7 +45,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, num_channels, height, width)` """ @@ -57,7 +57,7 @@ class DeepseekVL2ImagePixelInputs(TypedDict): class DeepseekVL2VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. 
@@ -394,8 +394,8 @@ def _init_vision_module( return model def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.vision_config.image_size expected_dims = (3, h, w) @@ -415,8 +415,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_images_spatial_crop( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: expected_dims = 2 def _validate_shape(d: torch.Tensor): @@ -640,8 +640,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 4ff1e785494f..726660796a6f 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -183,7 +184,7 @@ def compute_logits(self, hidden_states: torch.Tensor, return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # This implementation is incompitable with https://huggingface.co/yuhuili/EAGLE-LLaMA3-Instruct-8B # due to missing lm_head weights and its config being that of a # Llama model. Here's a compatible version with the same weights: diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 4a6490cd127a..4ffd06319684 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -24,7 +24,8 @@ # limitations under the License. 
"""Inference-only Exaone model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -196,7 +197,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -282,7 +283,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -384,8 +385,8 @@ def forward( hidden_states, _ = self.ln_f(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -395,7 +396,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -535,8 +536,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # With tie_word_embeddings, we can skip lm_head.weight diff --git a/vllm/model_executor/models/fairseq2_llama.py b/vllm/model_executor/models/fairseq2_llama.py index 310aca999bc2..00dbbebb120e 100644 --- a/vllm/model_executor/models/fairseq2_llama.py +++ b/vllm/model_executor/models/fairseq2_llama.py @@ -16,7 +16,7 @@ # limitations under the License. 
"""Llama model for fairseq2 weights.""" -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch from torch.nn import Parameter @@ -44,8 +44,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): f"model.{self.tp_rank}.pt", ] - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # fairseq2's serialization adds a wrapper to usual .pt state_dict's: # { "model_key": my_model_name, "my_model_name": state_dict } # which we first need to unpack @@ -102,7 +102,7 @@ def reshape_fairseq2_weights( name: str, loaded_weight: torch.Tensor, params: dict[str, Parameter], - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: """Reshape fairseq2's weights.""" def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor: diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e7e03fc09972..376793594f8b 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -20,7 +20,8 @@ """PyTorch Falcon model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -394,8 +395,8 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -405,7 +406,7 @@ def load_weights(self, weights: Iterable[Tuple[str, total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -498,8 +499,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index d1a36c3f481a..f8acc56706d2 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -3,7 +3,7 @@ import math from collections import OrderedDict from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -713,8 +713,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -723,7 +723,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -922,8 +922,8 @@ def _build_image_projection_layers(self, config: PretrainedConfig): 'Florence2 only supports COSINE as temporal embedding.') def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: size = self.processor_config["size"] h, w = size["height"], size["width"] @@ -944,12 +944,12 @@ def _validate_shape(d: torch.Tensor): return data def _parse_and_validate_image_input(self, **kwargs: object): - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) @@ -1096,7 +1096,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index d6bd6155a447..fbad7f56d0ba 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -18,7 +18,7 @@ """ PyTorch Fuyu model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict +from typing import Literal, Optional, TypedDict import torch import torch.nn as nn @@ -382,7 +382,7 @@ def compute_logits( self.language_model.lm_head, hidden_states, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + 
def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index c1cc0df11178..0f6d94e7518b 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -15,8 +15,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Gemma model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import cache -from typing import Iterable, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -231,7 +232,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -318,8 +319,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -329,7 +330,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: @@ -413,8 +414,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 7fb2e9948c06..b46716213c62 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -15,7 +15,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -218,7 +219,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -305,8 +306,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -316,7 +317,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -413,8 +414,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index 4e0d4f84ca6b..3a88adcce0bd 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -14,7 +14,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -320,7 +321,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], **kwargs, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -412,8 +413,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -423,7 +424,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -521,8 +522,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py index 65c177f8c5ad..743542ec8dfa 100644 --- a/vllm/model_executor/models/gemma3_mm.py +++ b/vllm/model_executor/models/gemma3_mm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict +from typing import Any, Literal, Optional, TypedDict import torch from torch import nn @@ -701,8 +701,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 290be968cb54..f351ce5a0681 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GLM-4-0414 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -60,7 +61,7 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -183,7 +184,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -293,8 +294,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index e3219333915e..470a7053e1b6 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -280,10 +281,10 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name or ".attn.masked_bias" in name: # Skip attention mask. diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index def6b1544d8c..6a1d97bd7b69 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -243,10 +244,10 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ".attn.bias" in name: # Skip attention mask. 
@@ -327,8 +328,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."]), diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 3db96fb8e187..69fdd90cfbe8 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -228,8 +229,8 @@ def forward( hidden_states = self.ln_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -239,7 +240,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue @@ -331,7 +332,7 @@ def compute_logits( sampling_metadata, self.lm_head.bias) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) \ No newline at end of file diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 620ee66f57e7..401fa9f5cc8b 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -17,7 +17,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -240,10 +241,10 @@ def forward( hidden_states = self.final_layer_norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): @@ -324,7 +325,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 0696a7245c22..eed0820a5779 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only IBM Granite model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -97,7 +98,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -230,7 +231,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -321,8 +322,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -332,7 +333,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -475,8 +476,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = [ "rotary_emb.inv_freq", # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index b43b59da6d11..512ec55177d8 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py 
@@ -23,7 +23,8 @@ # limitations under the License. """Inference-only IBM Granite speeech model.""" import math -from typing import Iterable, Mapping, Optional, Set, Tuple, TypedDict, Union +from collections.abc import Iterable, Mapping +from typing import Optional, TypedDict, Union import torch import torch.nn.functional as F @@ -763,8 +764,8 @@ def compute_logits( def load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Set[str]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 7fff14cb9f12..f342dfff824f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GraniteMoe model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -305,8 +306,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -425,8 +426,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 706e648f1b4f..443b102c9968 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only GraniteMoeHybrid model.""" # Added by the IBM Team, 2025 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -381,10 +382,10 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() def _load(n, p): param = params_dict[n] @@ -538,7 +539,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size @@ -578,7 +579,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/granitemoeshared.py b/vllm/model_executor/models/granitemoeshared.py 
index 4e660cbf667b..817e6091d276 100644 --- a/vllm/model_executor/models/granitemoeshared.py +++ b/vllm/model_executor/models/granitemoeshared.py @@ -4,7 +4,8 @@ The architecture is the same as granitemoe but with the addition of shared experts. """ -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -208,8 +209,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -329,8 +330,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index 6f56eb2d5e38..6d2d16d098d4 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Grok1 model.""" -from typing import Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn.functional as F @@ -263,7 +264,7 @@ def forward( kv_cache: torch.Tensor, attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -340,7 +341,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, @@ -371,8 +372,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -390,7 +391,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and @@ -528,7 +529,7 @@ def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, - kv_caches: List[torch.Tensor], + kv_caches: list[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[torch.Tensor] = None, @@ -547,8 +548,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = ["rotary_emb.inv_freq"] # Skip lm_head when tie_word_embeddings is True if self.config.tie_word_embeddings: diff --git 
a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index cb0379c10f3a..b8bdc7aa32b2 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -17,7 +17,8 @@ # limitations under the License. """PyTorch Idefics2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -342,8 +343,8 @@ def forward( last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -351,7 +352,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 961954c2b584..fdb128ef5b54 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -17,7 +17,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import Dict, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -85,7 +85,7 @@ class Idefics3ProcessingInfo(BaseProcessingInfo): def get_hf_processor( self, *, - size: Optional[Dict[str, int]] = None, + size: Optional[dict[str, int]] = None, **kwargs: object, ) -> Idefics3Processor: if size is not None: @@ -752,8 +752,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 7fea9647ead9..8f33a3e29c60 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, - Protocol, Type, Union, overload, runtime_checkable) +from typing import (TYPE_CHECKING, ClassVar, Literal, Optional, Protocol, + Union, overload, runtime_checkable) import torch from torch import Tensor @@ -102,7 +102,7 @@ class _SupportsMultiModalType(Protocol): @overload def supports_multimodal( - model: Type[object]) -> TypeIs[Type[SupportsMultiModal]]: + model: type[object]) -> TypeIs[type[SupportsMultiModal]]: ... 
@@ -112,8 +112,8 @@ def supports_multimodal(model: object) -> TypeIs[SupportsMultiModal]: def supports_multimodal( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsMultiModal]], TypeIs[SupportsMultiModal]]: if isinstance(model, type): return isinstance(model, _SupportsMultiModalType) @@ -134,9 +134,9 @@ class SupportsLoRA(Protocol): """ # The `embedding_module` and `embedding_padding_modules` # are empty by default. - embedding_modules: ClassVar[Dict[str, str]] = {} - embedding_padding_modules: ClassVar[List[str]] = [] - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + embedding_modules: ClassVar[dict[str, str]] = {} + embedding_padding_modules: ClassVar[list[str]] = [] + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} # We can't use runtime_checkable with ClassVar for issubclass checks @@ -145,13 +145,13 @@ class SupportsLoRA(Protocol): class _SupportsLoRAType(Protocol): supports_lora: Literal[True] - packed_modules_mapping: Dict[str, List[str]] - embedding_modules: Dict[str, str] - embedding_padding_modules: List[str] + packed_modules_mapping: dict[str, list[str]] + embedding_modules: dict[str, str] + embedding_padding_modules: list[str] @overload -def supports_lora(model: Type[object]) -> TypeIs[Type[SupportsLoRA]]: +def supports_lora(model: type[object]) -> TypeIs[type[SupportsLoRA]]: ... @@ -161,8 +161,8 @@ def supports_lora(model: object) -> TypeIs[SupportsLoRA]: def supports_lora( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsLoRA]], TypeIs[SupportsLoRA]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsLoRA]], TypeIs[SupportsLoRA]]: result = _supports_lora(model) if not result: @@ -191,7 +191,7 @@ def supports_lora( return result -def _supports_lora(model: Union[Type[object], object]) -> bool: +def _supports_lora(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsLoRAType) @@ -256,7 +256,7 @@ def forward( @overload -def supports_pp(model: Type[object]) -> TypeIs[Type[SupportsPP]]: +def supports_pp(model: type[object]) -> TypeIs[type[SupportsPP]]: ... @@ -266,8 +266,8 @@ def supports_pp(model: object) -> TypeIs[SupportsPP]: def supports_pp( - model: Union[Type[object], object], -) -> Union[bool, TypeIs[Type[SupportsPP]], TypeIs[SupportsPP]]: + model: Union[type[object], object], +) -> Union[bool, TypeIs[type[SupportsPP]], TypeIs[SupportsPP]]: supports_attributes = _supports_pp_attributes(model) supports_inspect = _supports_pp_inspect(model) @@ -298,14 +298,14 @@ def supports_pp( return supports_attributes and supports_inspect -def _supports_pp_attributes(model: Union[Type[object], object]) -> bool: +def _supports_pp_attributes(model: Union[type[object], object]) -> bool: if isinstance(model, type): return isinstance(model, _SupportsPPType) return isinstance(model, SupportsPP) -def _supports_pp_inspect(model: Union[Type[object], object]) -> bool: +def _supports_pp_inspect(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -336,13 +336,13 @@ def has_inner_state(model: object) -> TypeIs[HasInnerState]: @overload -def has_inner_state(model: Type[object]) -> TypeIs[Type[HasInnerState]]: +def has_inner_state(model: type[object]) -> TypeIs[type[HasInnerState]]: ... 
def has_inner_state( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasInnerState]], TypeIs[HasInnerState]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasInnerState]], TypeIs[HasInnerState]]: if isinstance(model, type): return isinstance(model, _HasInnerStateType) @@ -373,13 +373,13 @@ def is_attention_free(model: object) -> TypeIs[IsAttentionFree]: @overload -def is_attention_free(model: Type[object]) -> TypeIs[Type[IsAttentionFree]]: +def is_attention_free(model: type[object]) -> TypeIs[type[IsAttentionFree]]: ... def is_attention_free( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsAttentionFree]], TypeIs[IsAttentionFree]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsAttentionFree]], TypeIs[IsAttentionFree]]: if isinstance(model, type): return isinstance(model, _IsAttentionFreeType) @@ -410,13 +410,13 @@ def is_hybrid(model: object) -> TypeIs[IsHybrid]: @overload -def is_hybrid(model: Type[object]) -> TypeIs[Type[IsHybrid]]: +def is_hybrid(model: type[object]) -> TypeIs[type[IsHybrid]]: ... def is_hybrid( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[IsHybrid]], TypeIs[IsHybrid]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[IsHybrid]], TypeIs[IsHybrid]]: if isinstance(model, type): return isinstance(model, _IsHybridType) @@ -439,13 +439,13 @@ def has_noops(model: object) -> TypeIs[HasNoOps]: @overload -def has_noops(model: Type[object]) -> TypeIs[Type[HasNoOps]]: +def has_noops(model: type[object]) -> TypeIs[type[HasNoOps]]: ... def has_noops( - model: Union[Type[object], object] -) -> Union[TypeIs[Type[HasNoOps]], TypeIs[HasNoOps]]: + model: Union[type[object], object] +) -> Union[TypeIs[type[HasNoOps]], TypeIs[HasNoOps]]: if isinstance(model, type): return isinstance(model, _HasNoOpsType) @@ -461,7 +461,7 @@ class SupportsCrossEncoding(Protocol): @overload def supports_cross_encoding( - model: Type[object]) -> TypeIs[Type[SupportsCrossEncoding]]: + model: type[object]) -> TypeIs[type[SupportsCrossEncoding]]: ... @@ -471,8 +471,8 @@ def supports_cross_encoding(model: object) -> TypeIs[SupportsCrossEncoding]: def _supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: if isinstance(model, type): return isinstance(model, SupportsCrossEncoding) @@ -481,15 +481,15 @@ def _supports_cross_encoding( def supports_cross_encoding( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsCrossEncoding]], TypeIs[SupportsCrossEncoding]]: return is_pooling_model(model) and _supports_cross_encoding(model) class SupportsQuant: """The interface required for all models that support quantization.""" - packed_modules_mapping: ClassVar[Dict[str, List[str]]] = {} + packed_modules_mapping: ClassVar[dict[str, list[str]]] = {} quant_config: Optional[QuantizationConfig] = None def __new__(cls, *args, **kwargs) -> Self: @@ -525,7 +525,7 @@ class SupportsTranscription(Protocol): @overload def supports_transcription( - model: Type[object]) -> TypeIs[Type[SupportsTranscription]]: + model: type[object]) -> TypeIs[type[SupportsTranscription]]: ... 
@@ -535,8 +535,8 @@ def supports_transcription(model: object) -> TypeIs[SupportsTranscription]: def supports_transcription( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsTranscription]], TypeIs[SupportsTranscription]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsTranscription]], TypeIs[SupportsTranscription]]: if isinstance(model, type): return isinstance(model, SupportsTranscription) @@ -551,7 +551,7 @@ class SupportsV0Only(Protocol): @overload -def supports_v0_only(model: Type[object]) -> TypeIs[Type[SupportsV0Only]]: +def supports_v0_only(model: type[object]) -> TypeIs[type[SupportsV0Only]]: ... @@ -561,8 +561,8 @@ def supports_v0_only(model: object) -> TypeIs[SupportsV0Only]: def supports_v0_only( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[SupportsV0Only]], TypeIs[SupportsV0Only]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[SupportsV0Only]], TypeIs[SupportsV0Only]]: if isinstance(model, type): return isinstance(model, SupportsV0Only) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index f141dcf3cd4f..d325a6b67132 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import (TYPE_CHECKING, Optional, Protocol, Type, Union, overload, +from typing import (TYPE_CHECKING, Optional, Protocol, Union, overload, runtime_checkable) import torch @@ -20,7 +20,7 @@ # The type of hidden states # Currently, T = torch.Tensor for all models except for Medusa -# which has T = List[torch.Tensor] +# which has T = list[torch.Tensor] T = TypeVar("T", default=torch.Tensor) T_co = TypeVar("T_co", default=torch.Tensor, covariant=True) @@ -48,12 +48,12 @@ def forward( ... -def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_init(model: Union[type[object], object]) -> bool: model_init = model.__init__ return supports_kw(model_init, "vllm_config") -def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: +def _check_vllm_model_forward(model: Union[type[object], object]) -> bool: model_forward = getattr(model, "forward", None) if not callable(model_forward): return False @@ -75,7 +75,7 @@ def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: @overload -def is_vllm_model(model: Type[object]) -> TypeIs[Type[VllmModel]]: +def is_vllm_model(model: type[object]) -> TypeIs[type[VllmModel]]: ... @@ -85,8 +85,8 @@ def is_vllm_model(model: object) -> TypeIs[VllmModel]: def is_vllm_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModel]], TypeIs[VllmModel]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModel]], TypeIs[VllmModel]]: return _check_vllm_model_init(model) and _check_vllm_model_forward(model) @@ -105,7 +105,7 @@ def compute_logits( @overload def is_text_generation_model( - model: Type[object]) -> TypeIs[Type[VllmModelForTextGeneration]]: + model: type[object]) -> TypeIs[type[VllmModelForTextGeneration]]: ... 
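These interface hunks only swap `Type[object]` for `type[object]`, but the idiom they touch is worth spelling out: each capability check is an `@overload` pair so the same helper can narrow either a model class or a model instance through `TypeIs`. A small self-contained sketch of that idiom, with a toy protocol name and `TypeIs` assumed to come from `typing_extensions`:

    from typing import Protocol, Union, overload, runtime_checkable

    from typing_extensions import TypeIs


    @runtime_checkable
    class SupportsThing(Protocol):

        def do_thing(self) -> None:
            ...


    @overload
    def supports_thing(model: type[object]) -> TypeIs[type[SupportsThing]]:
        ...


    @overload
    def supports_thing(model: object) -> TypeIs[SupportsThing]:
        ...


    def supports_thing(
        model: Union[type[object], object],
    ) -> Union[TypeIs[type[SupportsThing]], TypeIs[SupportsThing]]:
        # Classes are checked structurally with issubclass, instances with
        # isinstance; either way the caller's static type gets narrowed.
        if isinstance(model, type):
            return issubclass(model, SupportsThing)
        return isinstance(model, SupportsThing)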
@@ -116,8 +116,8 @@ def is_text_generation_model( def is_text_generation_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForTextGeneration]], + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForTextGeneration]], TypeIs[VllmModelForTextGeneration]]: if not is_vllm_model(model): return False @@ -142,7 +142,7 @@ def pooler( @overload -def is_pooling_model(model: Type[object]) -> TypeIs[Type[VllmModelForPooling]]: +def is_pooling_model(model: type[object]) -> TypeIs[type[VllmModelForPooling]]: ... @@ -152,8 +152,8 @@ def is_pooling_model(model: object) -> TypeIs[VllmModelForPooling]: def is_pooling_model( - model: Union[Type[object], object], -) -> Union[TypeIs[Type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: + model: Union[type[object], object], +) -> Union[TypeIs[type[VllmModelForPooling]], TypeIs[VllmModelForPooling]]: if not is_vllm_model(model): return False diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index fdcef8b9be8d..d9d9002bd5ba 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -6,8 +6,9 @@ # Copyright (c) 2023 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Set, Tuple +from typing import Optional import torch import torch.nn as nn @@ -461,10 +462,10 @@ def forward( return encoder_outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index c3d7cbfcddbb..3f3e3966e838 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 +from collections.abc import Iterable from functools import partial -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Type, Union +from typing import Any, Optional, Union import torch from torch import nn @@ -81,7 +82,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -225,7 +226,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -252,7 +253,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[InternLMDecoderLayer] = InternLMDecoderLayer): + layer_type: type[InternLMDecoderLayer] = InternLMDecoderLayer): super().__init__() config = vllm_config.model_config.hf_config @@ -316,7 +317,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model): + model_type: type[InternLM2Model] = InternLM2Model): super().__init__() config = 
vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -361,15 +362,15 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -407,7 +408,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - model_type: Type[InternLM2Model] = InternLM2Model, + model_type: type[InternLM2Model] = InternLM2Model, ): super().__init__(vllm_config=vllm_config, prefix=prefix, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 69b0caab8f8e..6893d0239121 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -66,7 +66,7 @@ def forward( hidden_states: torch.Tensor, residual: Optional[torch.Tensor], visual_token_mask: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 23b92ad2bbf6..66e78fcc4e80 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -8,7 +8,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -932,8 +932,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: # unused modules appear in OpenGVLab/InternVideo2_5_Chat_8B skip_prefixes = [ "action_embed", "temporal_embed", "track_embed", diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index e1e3f0f199c5..d6a1e0bb4845 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -21,7 +21,8 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -333,10 +334,10 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final diff --git 
a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 46335c2b3930..6f9fa60c9b05 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only Jamba model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -442,7 +443,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = self.config.hidden_size conv_state_shape = ( @@ -464,8 +465,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -482,7 +483,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -583,7 +584,7 @@ def pooler( logits = self.score(hidden_states) return self._pooler(logits, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): # TODO: The reward weights themselves have float32 accuracy data, we # would like to load them in fp32 to get that extra precision. super().load_weights(weights) diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 0629266860fd..b575f44765a8 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -43,10 +43,9 @@ import copy import math -from collections.abc import Mapping +from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass -from typing import (Any, Iterable, List, Literal, Optional, Sequence, Tuple, - TypedDict, Union) +from typing import Any, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -120,7 +119,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class KimiVLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - pixel_values: Union[torch.Tensor, List[torch.Tensor]] + pixel_values: Union[torch.Tensor, list[torch.Tensor]] """ Shape:`(num_patches, num_channels, patch_size, patch_size)` """ @@ -447,7 +446,7 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata, **kwargs) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): config = self.config.text_config _KEYS_TO_MODIFY_MAPPING = { "language_model.lm_head": "lm_head", diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index c1593dcbe344..c15c0213b520 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -103,7 +104,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -285,7 +286,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -394,8 +395,8 @@ def forward( return hidden_states, aux_hidden_states return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -405,7 +406,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -582,8 +583,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -599,7 +600,7 @@ def maybe_remap_mistral( self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index dfd0804f21cf..40fdd84d8fb0 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -16,7 +16,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Any, Optional import torch from torch import nn @@ -48,7 +49,7 @@ def custom_routing_function( gating_output: torch.Tensor, topk: int, renormalize: bool, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: router_scores, router_indices = fast_topk(gating_output, topk, dim=-1) # psuedo-standard is that the router scores are floats router_scores = torch.sigmoid(router_scores.float()) @@ -115,7 +116,7 @@ def __init__(self, num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -300,7 +301,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -335,9 +336,9 @@ def load_moe_expert_weights( self, name: str, loaded_weight: torch.Tensor, - params_dict: Dict[str, nn.Parameter], - loaded_params: Set[str], - expert_params_mapping: List[Tuple[str, str, int, str]], + params_dict: dict[str, nn.Parameter], + loaded_params: set[str], + expert_params_mapping: list[tuple[str, str, int, str]], fused: bool = True, ) -> bool: expert_param_loaded = False @@ -390,8 +391,8 @@ def load_moe_expert_weights( expert_param_loaded = True return expert_param_loaded - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -412,7 +413,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ckpt_up_proj_name="gate_up_proj", num_experts=1) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "experts.gate_up_proj" in name or "experts.down_proj" in name: fused_experts_params = True @@ -489,8 +490,8 @@ def _init_model(self, prefix=prefix, layer_type=layer_type) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -506,7 +507,7 @@ def permute_qk_weight_for_rotary( self, name: str, loaded_weight: torch.Tensor, - ) -> Tuple[str, torch.Tensor]: + ) -> tuple[str, torch.Tensor]: def permute(w: torch.Tensor, n_heads: int): attn_in = self.config.head_dim * n_heads diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 4e51daa220e4..018ecc2a8c0f 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -92,8 +92,8 @@ def forward( hidden_states = hidden_states + residual return hidden_states, hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) 
-> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -103,7 +103,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -150,7 +150,7 @@ def forward( ) -> tuple[torch.Tensor, torch.Tensor]: return self.model(input_ids, positions, hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 9761c8389db2..2302d1352de6 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -56,7 +57,7 @@ def forward( embeds: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: residual = hidden_states embeds = self.input_layernorm(embeds) @@ -140,8 +141,8 @@ def forward( hidden_states, hidden_prenorm = self.norm(hidden_states, residual) return hidden_states, hidden_prenorm - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -151,7 +152,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if 'midlayer.' 
in name: name = name.replace('midlayer.', 'layers.0.') @@ -228,7 +229,7 @@ def combine_hidden_states( # combine multiple auxiliary hidden states returned by eagle3 return self.model.fc(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, skip_prefixes=None, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6287fdb3300c..95c1a0ca0b98 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -2,8 +2,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union, cast) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union, cast) import torch import torch.nn as nn @@ -751,8 +751,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c7e8d6991b25..e731f1bfdb9a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,8 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod -from typing import (Final, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, TypedDict, TypeVar, Union) +from collections.abc import Iterable, Mapping +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -266,8 +267,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -450,7 +451,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaNextImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -577,7 +578,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index a5ff189cfdb5..9303ea121727 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -2,7 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -35,7 +35,7 @@ class LlavaNextVideoPixelInputs(TypedDict): type: Literal["pixel_values_videos"] - data: Union[torch.Tensor, 
List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size, num_frames, num_channels, height, width)` @@ -300,8 +300,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.language_model.model.make_empty_intermediate_tensors) def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -326,7 +326,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values_videos = kwargs.pop("pixel_values_videos", None) @@ -460,8 +460,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 5c2b388e403d..49f1ecb4be89 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -2,8 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, List, Literal, Optional, Protocol, Set, Tuple, - TypedDict, Union) +from typing import Final, Literal, Optional, Protocol, TypedDict, Union import torch import torch.nn as nn @@ -471,8 +470,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_image_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -530,8 +529,8 @@ def _parse_and_validate_image_input( raise AssertionError("This line should be unreachable.") def _validate_video_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = self.config.vision_config.image_size expected_dims = (3, h, w) @@ -557,7 +556,7 @@ def _parse_and_validate_video_input( A legal video input should have the following dimensions: { "pixel_values_videos" : - List[b, Tensor(nb_frames, nb_channels, height, width)] + list[b, Tensor(nb_frames, nb_channels, height, width)] } """ pixel_values_videos = kwargs.pop("pixel_values_videos", None) @@ -706,7 +705,7 @@ def _merge_image_patch_embeddings(self, def _process_image_pixels( self, inputs: LlavaOnevisionImagePixelInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: assert self.vision_tower is not None pixel_values = inputs["pixel_values"] @@ -735,7 +734,7 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaOnevisionImageInputs, - ) -> Union[torch.Tensor, List[torch.Tensor]]: + ) -> Union[torch.Tensor, list[torch.Tensor]]: if image_input["type"] == "image_embeds": return [image_input["data"]] @@ -948,7 +947,7 
@@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index af78ece66bbe..ce76a76b6574 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -30,7 +31,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class MambaDecoderLayer(nn.Module): @@ -153,10 +154,10 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") @@ -247,7 +248,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape = ( self.config.intermediate_size // world_size, @@ -265,7 +266,7 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba2.py b/vllm/model_executor/models/mamba2.py index 72daf34c4412..858a1633befa 100644 --- a/vllm/model_executor/models/mamba2.py +++ b/vllm/model_executor/models/mamba2.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 """PyTorch MAMBA2 model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -35,7 +36,7 @@ make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) -KVCache = Tuple[torch.Tensor, torch.Tensor] +KVCache = tuple[torch.Tensor, torch.Tensor] class Mamba2DecoderLayer(nn.Module): @@ -241,7 +242,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() conv_state_shape, temporal_state_shape = None, None @@ -279,10 +280,10 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - 
loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") diff --git a/vllm/model_executor/models/mamba_cache.py b/vllm/model_executor/models/mamba_cache.py index 25839727898f..47d0ef9cc6bb 100644 --- a/vllm/model_executor/models/mamba_cache.py +++ b/vllm/model_executor/models/mamba_cache.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from dataclasses import dataclass -from typing import Tuple import torch @@ -25,8 +24,8 @@ def at_layer_idx(self, layer_idx): class MambaCacheManager(ConstantSizeCache): def __init__(self, vllm_config: VllmConfig, dtype: torch.dtype, - num_mamba_layers: int, conv_state_shape: Tuple[int, int], - temporal_state_shape: Tuple[int, int]): + num_mamba_layers: int, conv_state_shape: tuple[int, int], + temporal_state_shape: tuple[int, int]): # Determine max batch size to set size of MambaCache max_batch_size = vllm_config.scheduler_config.max_num_seqs diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index a19d7da5654b..ac0b281f359c 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, List, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -96,13 +97,13 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: # checkpoint file has token_map tensor. self.token_map = None - def forward(self, hidden_states: torch.Tensor) -> List[torch.Tensor]: + def forward(self, hidden_states: torch.Tensor) -> list[torch.Tensor]: return [block(hidden_states) for block in self.blocks] def compute_logits( - self, hidden_states: List[torch.Tensor], - sampling_metadata: SamplingMetadata) -> List[torch.Tensor]: - logits_lst: List[torch.Tensor] = [] + self, hidden_states: list[torch.Tensor], + sampling_metadata: SamplingMetadata) -> list[torch.Tensor]: + logits_lst: list[torch.Tensor] = [] for hs, lm_head in zip(hidden_states, self.lm_heads): _logits = self.logits_processor(lm_head, hs, sampling_metadata) @@ -127,9 +128,9 @@ def compute_logits( def sample( self, - logits: List[torch.Tensor], + logits: list[torch.Tensor], sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: logits = torch.stack(logits, dim=0).float() logprobs = torch.log_softmax(logits, dim=-1) token_ids = logits.argmax(-1) # support only top-1 for now @@ -144,7 +145,7 @@ def sample( token_prob_list.append(probs[:, seq_group.sample_indices]) token_logprob_list.append(logprobs[:, seq_group.sample_indices]) - outputs: List[Optional[SamplerOutput]] = [] + outputs: list[Optional[SamplerOutput]] = [] for idx in range(len(sampling_metadata.seq_groups)): outputs.append( SamplerOutput( @@ -160,7 +161,7 @@ def generate_proposals( self, previous_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: return self.sample( logits=self.compute_logits( hidden_states=self.forward(previous_hidden_states), @@ -169,10 +170,10 @@ def generate_proposals( sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() weights_map = 
{} diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index b882aeebb08d..49ea64e029d6 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -24,7 +24,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -87,8 +88,8 @@ def forward( hidden_states = hidden_states + residual return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -97,7 +98,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "mtp_layers" in name: continue diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py index c2f1cf4112fe..adcfcaa6b1e6 100644 --- a/vllm/model_executor/models/mimo_mtp.py +++ b/vllm/model_executor/models/mimo_mtp.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiMo-MTP model.""" -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch import torch.nn as nn @@ -193,8 +194,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -204,7 +205,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 866dc3f466e7..d99ae81468a9 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -23,7 +23,8 @@ # limitations under the License. 
"""Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -190,7 +191,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -329,7 +330,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -428,8 +429,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -446,7 +447,7 @@ def load_weights(self, weights: Iterable[Tuple[str, for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -582,8 +583,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index 1b24c38cef1b..2a6867d12d99 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -23,7 +23,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only MiniCPM3 model compatible with HuggingFace weights.""" -from typing import Any, Dict, Optional +from typing import Any, Optional import torch from torch import nn @@ -58,7 +58,7 @@ def __init__( q_lora_rank: int, kv_lora_rank: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, diff --git a/vllm/model_executor/models/minicpmo.py b/vllm/model_executor/models/minicpmo.py index f42d48e919cd..ae5df0f9273f 100644 --- a/vllm/model_executor/models/minicpmo.py +++ b/vllm/model_executor/models/minicpmo.py @@ -23,8 +23,7 @@ # limitations under the License. 
"""Inference-only MiniCPM-O model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -559,8 +558,8 @@ def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""): self.audio_encoder_layer = -1 return model - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["tts"]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 300360f785ae..04cc7e35e345 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -26,8 +26,7 @@ from collections import defaultdict from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -118,7 +117,7 @@ def __init__(self, num_heads: int, kv_dim: Optional[int] = None, norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - max_size: Tuple[int, int] = (70, 70), + max_size: tuple[int, int] = (70, 70), quant_config: Optional[QuantizationConfig] = None, prefix: str = "") -> None: super().__init__(num_queries, @@ -133,7 +132,7 @@ def __init__(self, self._set_2d_pos_cache(self.max_size) def _set_2d_pos_cache(self, - max_size: Tuple[int, int], + max_size: tuple[int, int], device: torch.types.Device = "cpu") -> None: pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, max_size, @@ -203,7 +202,7 @@ def forward(self, x: torch.Tensor, return x -def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]: +def get_version_by_config(config: PretrainedConfig) -> tuple[int, ...]: version_float = getattr(config, "version", None) # The old configs do not include version number @@ -938,8 +937,8 @@ def compute_logits( ) -> Optional[torch.Tensor]: return self.llm.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py index 951f4e2304a1..0285402dadf7 100644 --- a/vllm/model_executor/models/minimax_text_01.py +++ b/vllm/model_executor/models/minimax_text_01.py @@ -3,7 +3,8 @@ import copy import math import re -from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.distributed @@ -127,7 +128,7 @@ def forward( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: assert residual is None, "RMSNorm does not support residual connection." 
return self._forward(x) @@ -178,7 +179,7 @@ def forward( positions: torch.Tensor, query: torch.Tensor, key: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops self.cos_sin_cache = self.cos_sin_cache.to(positions.device) query_cast = query.to(self.cache_dtype) @@ -708,11 +709,11 @@ def __init__( def forward(self, hidden_states: torch.Tensor, positions: torch.Tensor, - kv_caches: Union[List[Dict], Optional[torch.Tensor]], + kv_caches: Union[list[dict], Optional[torch.Tensor]], attn_metadata: AttentionMetadata, residual: Optional[torch.Tensor], is_warmup: bool = False, - **kwargs) -> Tuple[torch.Tensor, torch.Tensor]: + **kwargs) -> tuple[torch.Tensor, torch.Tensor]: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata @@ -1072,10 +1073,10 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() def which_layer(name: str) -> int: if "layers" in name: diff --git a/vllm/model_executor/models/minimax_vl_01.py b/vllm/model_executor/models/minimax_vl_01.py index 4ac60f97bb5f..14c1250ca3b4 100644 --- a/vllm/model_executor/models/minimax_vl_01.py +++ b/vllm/model_executor/models/minimax_vl_01.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping -from typing import Literal, Optional, Set, Tuple, TypedDict, Union, cast +from typing import Literal, Optional, TypedDict, Union, cast import torch import torch.nn as nn @@ -357,7 +357,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py index 42ec786f3a59..2b9cbf10440a 100644 --- a/vllm/model_executor/models/mistral3.py +++ b/vllm/model_executor/models/mistral3.py @@ -2,8 +2,8 @@ from abc import abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import (Final, Literal, Optional, Protocol, Set, Tuple, TypedDict, - TypeVar, Union) +from typing import (Final, Literal, Optional, Protocol, TypedDict, TypeVar, + Union) import torch import torch.nn as nn @@ -589,8 +589,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 1513c8dad097..1968bf9e68af 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -314,8 +315,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -332,7 +333,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -479,7 +480,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 7c022a5b8f68..4de83d12be6a 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import numpy as np import torch @@ -397,8 +398,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -407,7 +408,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 0c1d61c01f91..713c9e8d203f 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -16,7 +16,7 @@ """PyTorch Mllama model.""" import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import numpy as np import torch @@ -224,7 +224,7 @@ def apply( return mm_inputs - def _get_num_image_in_last_group(self, prompt_token_ids: List[int]) -> int: + def _get_num_image_in_last_group(self, prompt_token_ids: list[int]) -> int: num_images = 0 for token_id in prompt_token_ids[::-1]: if token_id == self.info.get_hf_config().image_token_index: @@ -370,8 +370,8 @@ def __init__( self, in_channels: int, out_channels: int, - kernel_size: Union[int, Tuple[int, int]], - stride: Union[int, Tuple[int, int]], + kernel_size: Union[int, tuple[int, int]], + stride: Union[int, tuple[int, int]], bias: bool = False, ) -> None: super().__init__() @@ 
-603,7 +603,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, - ) -> Union[Tuple, BaseModelOutput]: + ) -> Union[BaseModelOutput]: encoder_states = () for i, encoder_layer in enumerate(self.layers): @@ -878,7 +878,7 @@ def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], cross_attention_states: Optional[torch.Tensor], ) -> torch.Tensor: q, k, v = self.qkv_proj(hidden_states, cross_attention_states) @@ -905,7 +905,7 @@ def _attention_with_mask( k: torch.Tensor, v: torch.Tensor, attention_mask: torch.Tensor, - kv_range_for_decode: List[Tuple[int, int]], + kv_range_for_decode: list[tuple[int, int]], ) -> torch.Tensor: kv_cache = self.attn.kv_cache[self.pipeline_parallel_rank] attn_metadata: AttentionMetadata = get_forward_context().attn_metadata @@ -1019,7 +1019,7 @@ def forward( hidden_states: torch.Tensor, cross_attention_states: torch.Tensor, cross_attention_mask: torch.Tensor, - kv_range_for_decode: Optional[List[Tuple[int, int]]], + kv_range_for_decode: Optional[list[tuple[int, int]]], full_text_row_masked_out_mask: torch.Tensor, ) -> torch.Tensor: residual = hidden_states @@ -1089,8 +1089,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1150,8 +1150,8 @@ def forward( positions: Optional[torch.LongTensor], cross_attention_states: Optional[torch.LongTensor], cross_attention_mask: Optional[torch.LongTensor], - kv_range_for_decode: Optional[List[Tuple[int, int]]], - full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, + kv_range_for_decode: Optional[list[tuple[int, int]]], + full_text_row_masked_out_mask: Optional[tuple[torch.Tensor, torch.Tensor]], skip_cross_attention: bool, ) -> torch.Tensor: @@ -1221,7 +1221,7 @@ def compute_logits( return logits def unpack_data(self, - image_data: Union[List[torch.Tensor], torch.Tensor], + image_data: Union[list[torch.Tensor], torch.Tensor], padding_value=0) -> torch.Tensor: if isinstance(image_data, torch.Tensor): # torch.Tensor @@ -1230,7 +1230,7 @@ def unpack_data(self, assert isinstance( image_data[0], torch.Tensor), "Image data is not properly batched." 
- # List[torch.Tensor] + # list[torch.Tensor] bsz = len(image_data) max_length = max(t.size(0) for t in image_data) trailing_dims = image_data[0].shape[1:] @@ -1248,24 +1248,24 @@ def unpack_data(self, def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by # MultiModalKwargs.batch, so pixel_values here can be: - # - List[torch.Tensor]: + # - list[torch.Tensor]: # with shape (num_image, num_tiles, 3, image_res, image_res) # - torch.Tensor: # with shape (bs, num_image, num_tiles, 3, image_res, image_res) - pixel_values: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + pixel_values: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "pixel_values", None) - image_embeds: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + image_embeds: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "image_embeds", None) - aspect_ratio_ids: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_ids: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_ids", None) - aspect_ratio_mask: Optional[Union[List[List[torch.Tensor]], - List[torch.Tensor], + aspect_ratio_mask: Optional[Union[list[list[torch.Tensor]], + list[torch.Tensor], torch.Tensor]] = kwargs.pop( "aspect_ratio_mask", None) @@ -1293,10 +1293,10 @@ def _parse_and_validate_image_input(self, **kwargs: object): def _get_and_validate_encoder_lens( self, - encoder_seq_lens: List[int], - num_tiles: List[List[int]], + encoder_seq_lens: list[int], + num_tiles: list[list[int]], num_tokens_per_tile: int, - ) -> List[int]: + ) -> list[int]: # Get the actual number of encoder tokens for each sample. 
# Because attn_metadata.encoder_seq_lens only counts the last # group of images for each sample, which is used to cheat the @@ -1318,7 +1318,7 @@ def _get_and_validate_encoder_lens( def flat_encoder_result(self, cross_attention_states: torch.Tensor, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int]): + actual_encoder_seq_lens: list[int]): cross_attention_states_flat = torch.zeros( sum(actual_encoder_seq_lens), @@ -1342,8 +1342,8 @@ def get_cross_attention_states( self, image_inputs: MllamaImagePixelInputs, attn_metadata: AttentionMetadata, - actual_encoder_seq_lens: List[int], - ) -> Tuple[torch.Tensor]: + actual_encoder_seq_lens: list[int], + ) -> tuple[torch.Tensor]: # NOTE: llama's reference implementation runs vision model on CPU pixel_values = image_inputs['data'] aspect_ratio_ids = image_inputs['aspect_ratio_ids'] @@ -1367,10 +1367,10 @@ def get_cross_attention_mask( self, input_ids: torch.Tensor, attn_metadata: AttentionMetadata, - num_tiles: List[List[int]], + num_tiles: list[list[int]], num_tokens_per_tile: int, dtype: torch.dtype, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: token_ids = input_ids.tolist() start = 0 batch_token_ids = [] @@ -1422,7 +1422,7 @@ def forward( input_ids: torch.Tensor, positions: torch.Tensor, **kwargs: object, - ) -> Union[Tuple, CausalLMOutputWithPast]: + ) -> Union[CausalLMOutputWithPast]: attn_metadata = get_forward_context().attn_metadata if attn_metadata.num_prefill_tokens > 0 and \ attn_metadata.num_decode_tokens > 0: @@ -1476,8 +1476,8 @@ def forward( return outputs - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1487,7 +1487,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1538,7 +1538,7 @@ def get_mm_mapping(self) -> MultiModelKeys: tower_model="vision_model") -def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: +def skip_attention_mask(sparse_mask: list[list[int]]) -> bool: for mask in sparse_mask: # Skip text-only samples. 
if len(mask) == 0: @@ -1556,10 +1556,10 @@ def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: def convert_sparse_cross_attention_mask_to_dense( - sparse_mask: List[List[List[int]]], - num_tiles: List[List[int]], - lengths: List[int], -) -> Tuple[np.ndarray, List[Tuple[int, int]]]: + sparse_mask: list[list[list[int]]], + num_tiles: list[list[int]], + lengths: list[int], +) -> tuple[np.ndarray, list[tuple[int, int]]]: total_length = sum(lengths) total_tiles = sum([sum(tiles) for tiles in num_tiles]) dense_mask = np.zeros(shape=(total_length, total_tiles), dtype=np.int64) diff --git a/vllm/model_executor/models/mllama4.py b/vllm/model_executor/models/mllama4.py index 741b9837398c..8c98492c0bed 100644 --- a/vllm/model_executor/models/mllama4.py +++ b/vllm/model_executor/models/mllama4.py @@ -18,7 +18,7 @@ import math from collections.abc import Iterable, Mapping from itertools import tee -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -582,7 +582,7 @@ def _get_prompt_updates( mm_items: MultiModalDataItems, hf_processor_mm_kwargs: Mapping[str, object], out_mm_kwargs: MultiModalKwargs, - ) -> List[PromptUpdate]: + ) -> list[PromptUpdate]: assert ( mm_items.get_count("image", strict=False) == 0 or "aspect_ratios" in out_mm_kwargs @@ -778,26 +778,26 @@ def compute_logits( def separate_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], prefix: str, - ) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[ + ) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[ str, torch.Tensor]]]: weights1, weights2 = tee(weights, 2) - def get_prefix_weights() -> Iterable[Tuple[str, torch.Tensor]]: + def get_prefix_weights() -> Iterable[tuple[str, torch.Tensor]]: for name, data in weights1: if name.startswith(prefix): yield (name, data) - def get_other_weights() -> Iterable[Tuple[str, torch.Tensor]]: + def get_other_weights() -> Iterable[tuple[str, torch.Tensor]]: for name, data in weights2: if not name.startswith(prefix): yield (name, data) return get_prefix_weights(), get_other_weights() - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -806,7 +806,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - updated_params: Set[str] = set() + updated_params: set[str] = set() # language_model is an Llama4ForCausalLM instance. We load it's # using llama4's load_weights routine. 
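The per-model hunks in this commit all apply the same mechanical change to load_weights and related signatures: typing.Iterable/Tuple/Set are replaced by collections.abc.Iterable and the builtin tuple/set generics. A minimal sketch of the resulting signature shape, using a hypothetical load_weights_example function (only torch is assumed):

from collections.abc import Iterable

import torch


def load_weights_example(
        weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
    # Was: Iterable[Tuple[str, torch.Tensor]] -> Set[str], with Iterable,
    # Tuple and Set imported from typing. The collections.abc / builtin
    # forms are equivalent at runtime on Python 3.9+ and need no typing
    # imports.
    loaded_params: set[str] = set()
    for name, _tensor in weights:
        loaded_params.add(name)
    return loaded_params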
diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 2920427f94f7..a7d7aa7d44ef 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, List, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -148,7 +148,7 @@ def generate_proposals( previous_hidden_states: torch.Tensor, num_predict_tokens: int, sampling_metadata: SamplingMetadata, - ) -> List[SamplerOutput]: + ) -> list[SamplerOutput]: if num_predict_tokens > self.max_speculative_tokens: raise ValueError(f"Max speculative tokens for model is " f"{self.max_speculative_tokens}, but " @@ -190,10 +190,10 @@ def generate_proposals( return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: name = name.replace("speculator.", "") param = params_dict.get(name) diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 73effb207bce..86552aa05bf9 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Iterable, Optional, Set, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -212,11 +213,11 @@ def __init__( eps=config.norm_eps, bias=config.norm_bias) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: weights = self.hf_to_vllm_mapper.apply(weights) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: continue @@ -280,7 +281,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self._pooler = CrossEncodingPooler(config, self.classifier, ModernBertPooler(config)) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): self_weights = [] diff --git a/vllm/model_executor/models/module_mapping.py b/vllm/model_executor/models/module_mapping.py index 23814e6322d2..25e6f594069e 100644 --- a/vllm/model_executor/models/module_mapping.py +++ b/vllm/model_executor/models/module_mapping.py @@ -4,7 +4,7 @@ # https://github.com/modelscope/ms-swift/blob/v2.4.2/swift/utils/module_mapping.py from dataclasses import dataclass, field -from typing import List, Union +from typing import Union @dataclass @@ -46,17 +46,17 @@ class ModelKeys: @dataclass class MultiModelKeys(ModelKeys): - language_model: List[str] = field(default_factory=list) - connector: List[str] = field(default_factory=list) + language_model: list[str] = field(default_factory=list) + connector: list[str] = field(default_factory=list) # vision tower and audio tower - tower_model: List[str] = field(default_factory=list) - generator: List[str] = field(default_factory=list) + tower_model: list[str] = field(default_factory=list) + generator: list[str] = field(default_factory=list) @staticmethod - def 
from_string_field(language_model: Union[str, List[str]] = None, - connector: Union[str, List[str]] = None, - tower_model: Union[str, List[str]] = None, - generator: Union[str, List[str]] = None, + def from_string_field(language_model: Union[str, list[str]] = None, + connector: Union[str, list[str]] = None, + tower_model: Union[str, list[str]] = None, + generator: Union[str, list[str]] = None, **kwargs) -> 'MultiModelKeys': def to_list(value): diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 42bbb77a22c0..e215582a37ac 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -4,7 +4,7 @@ from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass from functools import cached_property, partial -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import numpy as np import torch @@ -90,7 +90,7 @@ class MolmoImageInputs(TypedDict): @dataclass class VisionBackboneConfig: - image_default_input_size: Tuple[int, int] = (336, 336) + image_default_input_size: tuple[int, int] = (336, 336) image_patch_size: int = 14 image_pos_patch_size: int = 14 image_emb_dim: int = 1024 @@ -267,7 +267,7 @@ def __init__( for _ in range(config.image_num_layers) ]) - def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + def forward(self, x: torch.Tensor) -> list[torch.Tensor]: hidden_states = [] for r in self.resblocks: x = r(x) @@ -334,7 +334,7 @@ def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor: def forward(self, x: torch.Tensor, - patch_num: Optional[int] = None) -> List[torch.Tensor]: + patch_num: Optional[int] = None) -> list[torch.Tensor]: """ : param x: (batch_size, num_patch, n_pixels) """ @@ -434,7 +434,7 @@ def __init__( ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -570,7 +570,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention if residual is None: residual = hidden_states @@ -596,7 +596,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Self Attention residual = hidden_states hidden_states = self.self_attn( @@ -740,15 +740,15 @@ def forward( # image_features: (batch_size, num_image, num_patch, d_model) return image_features - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("merged_linear", "gate_proj", 0), ("merged_linear", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -855,10 +855,10 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: 
Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: @@ -1530,7 +1530,7 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) weights = _get_weights_with_merged_embedding(weights) @@ -1548,8 +1548,8 @@ def get_mm_mapping(self) -> MultiModelKeys: def _get_weights_with_merged_embedding( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: embedding_weights = {} for name, weight in weights: if "wte.embedding" in name: diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index c367d90f847b..9f11d4a42273 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -42,9 +42,10 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. import math +from collections.abc import Sequence from copy import deepcopy from functools import cached_property -from typing import List, Optional, Sequence, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -222,7 +223,7 @@ def __init__( self, out_dim: int, in_dim: int = 3, - patch_size: Union[int, Tuple[int, int]] = (14, 14), + patch_size: Union[int, tuple[int, int]] = (14, 14), pos_emb_height: int = 14, pos_emb_width: int = 14, ): @@ -526,7 +527,7 @@ def patch_merger( x: torch.Tensor, grid_hw: torch.Tensor, merge_kernel_size: list[int, int] = (2, 2), -) -> List[torch.Tensor]: +) -> list[torch.Tensor]: d_model = x.size(-1) outputs = [] diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 77bd794058cd..6c396d778ae7 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -2,7 +2,8 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch import torch.nn as nn @@ -265,10 +266,10 @@ def forward( hidden_states = self.norm_f(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -323,7 +324,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 5208c0796c8d..862c53535e8a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Nemotron model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -69,7 +70,7 @@ def _cast_if_autocast_enabled(*args): class NemotronLayerNorm1P(nn.LayerNorm): def __init__(self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True, @@ -133,7 +134,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -267,7 +268,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -441,8 +442,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -450,7 +451,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index 988b994b7689..f4d5a77f2086 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only deci model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Type, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if self._is_no_op_attention: @@ -168,7 +169,7 @@ def __init__( *, vllm_config: VllmConfig, prefix: str = "", - layer_type: Type[DeciLMDecoderLayer] = DeciLMDecoderLayer, + layer_type: type[DeciLMDecoderLayer] = DeciLMDecoderLayer, ): super().__init__() @@ -260,8 +261,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -271,7 +272,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -428,8 +429,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 0781ca168f84..a36b62cd2284 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMo model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -209,7 +210,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[tuple[torch.Tensor, torch.Tensor]]]: # Attention block. residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -338,8 +339,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -349,7 +350,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py index 422b53d86f11..a41a959cdb04 100644 --- a/vllm/model_executor/models/olmo2.py +++ b/vllm/model_executor/models/olmo2.py @@ -23,8 +23,9 @@ # limitations under the License. 
"""Inference-only OLMo2 model compatible with HuggingFace weights.""" +from collections.abc import Iterable from functools import partial -from typing import Iterable, Optional, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): ) def _apply_qk_norm(self, q: torch.Tensor, - k: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.tp_size > 1: q = tensor_model_parallel_all_gather(q.contiguous()) k = tensor_model_parallel_all_gather(k.contiguous()) @@ -365,7 +366,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index e6925e125690..9a07f57fd999 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -102,7 +103,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 4096, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -307,8 +308,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -327,7 +328,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). @@ -439,8 +440,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["rotary_emb.inv_freq"], diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index d258eddae25d..8376d62410d4 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -18,7 +18,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only OPT model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -312,8 +313,8 @@ def forward( intermediate_tensors, inputs_embeds=inputs_embeds) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -321,7 +322,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -400,8 +401,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head.weight"] diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 8d9c000750d7..1ccd1fe1f741 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -5,7 +5,8 @@ # Copyright (c) OrionStar Inc. # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -72,7 +73,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, @@ -186,7 +187,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -259,8 +260,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -270,7 +271,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -341,8 +342,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=([ diff --git a/vllm/model_executor/models/ovis.py b/vllm/model_executor/models/ovis.py index 
5204c751216f..e03705d48f3e 100644 --- a/vllm/model_executor/models/ovis.py +++ b/vllm/model_executor/models/ovis.py @@ -17,8 +17,8 @@ # limitations under the License. """ PyTorch Ovis model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, - TypedDict, Union) +from collections.abc import Iterable, Mapping +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -211,7 +211,7 @@ class OvisImagePatchInputs(TypedDict): `(batch_size * (num_patches + 1))` """ - patches_per_image: List[int] + patches_per_image: list[int] """ List of number of total patches for each image in the batch. This is used to restore the first two dimensions of `flat_data`. @@ -545,8 +545,8 @@ def compute_logits( logits = self.llm.compute_logits(hidden_states, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 8699ae52622d..427005e9b704 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch from torch import nn @@ -391,7 +391,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index eacf02433b57..d46b95fea5a8 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only persimmon model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -260,10 +261,10 @@ def forward( hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if is_pp_missing_parameter(name, self): continue @@ -336,7 +337,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index fc2b108bad97..330ad5c59448 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -36,7 +36,8 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -248,8 +249,8 @@ def forward( return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -257,7 +258,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -348,7 +349,7 @@ def compute_logits( sampling_metadata, self.lm_head.bias) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 338e87b4285f..d00d7d886d67 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -230,8 +231,8 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor]]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor], + Optional[tuple[torch.Tensor]]]: qkv, _ = self.query_key_value(hidden_states) qkv = qkv.view(qkv.shape[:-1] + @@ -352,10 +353,10 @@ def forward( hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: 
Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if name.endswith(".bias") and name not in params_dict: continue @@ -454,8 +455,8 @@ def forward( output_hidden_states = output_hidden_states return output_hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head.weight"] diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a1442251b992..bb4d46be3f99 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -16,7 +16,7 @@ # limitations under the License. import re from collections.abc import Iterable, Mapping, Sequence -from typing import Any, List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -94,7 +94,7 @@ def _init_img_processor(hf_config: PretrainedConfig, class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -113,7 +113,7 @@ class Phi3VImagePixelInputs(TypedDict): class Phi3VImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. 
@@ -571,8 +571,8 @@ def _validate_shape(d: torch.Tensor): return data def _validate_pixel_values( - self, data: Union[torch.Tensor, List[torch.Tensor]] - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, data: Union[torch.Tensor, list[torch.Tensor]] + ) -> Union[torch.Tensor, list[torch.Tensor]]: h = w = CLIP_VIT_LARGE_PATCH14_336_CONFIG.image_size expected_dims = (3, h, w) @@ -707,8 +707,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index e5ff9ceddef7..fd154940ea7f 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import math from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import numpy as np import torch @@ -392,7 +392,7 @@ def forward(self, pixel_values: torch.FloatTensor, class Phi4MMImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """ Shape: `(batch_size * num_images, 1 + num_patches, num_channels, height, width)` @@ -417,7 +417,7 @@ class Phi4MMImagePixelInputs(TypedDict): class Phi4MMImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` `hidden_size` must match the hidden size of language model backbone. 
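The image-input TypedDicts touched in these hunks keep a `type: Literal[...]` tag plus a `data` field that is either one tensor or a list of per-image tensors; only the annotations change from `typing.List`/`Tuple` to builtin generics. A small self-contained sketch of that tag-discriminated pattern, reusing the field names visible above; the dispatch function is an illustrative assumption, not the model's actual processing code:

from typing import Literal, TypedDict, Union

import torch


class ImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: Union[torch.Tensor, list[torch.Tensor]]


class ImageEmbeddingInputs(TypedDict):
    type: Literal["image_embeds"]
    data: Union[torch.Tensor, list[torch.Tensor]]


ImageInputs = Union[ImagePixelInputs, ImageEmbeddingInputs]


def describe(inputs: ImageInputs) -> str:
    # dispatch on the Literal tag, as the _process_*_input methods do
    if inputs["type"] == "pixel_values":
        return "raw pixels to run through the vision encoder"
    return "precomputed embeddings, used as-is"


if __name__ == "__main__":
    x: ImagePixelInputs = {"type": "pixel_values",
                           "data": [torch.zeros(3, 336, 336)]}
    print(describe(x))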
@@ -426,7 +426,7 @@ class Phi4MMImageEmbeddingInputs(TypedDict): class Phi4MMAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: Union[torch.Tensor, List[torch.Tensor]] + data: Union[torch.Tensor, list[torch.Tensor]] """Shape: `(batch_size * num_audios, 80, M)""" @@ -1031,7 +1031,7 @@ def _process_audio_input(self, audio_input: Phi4MMAudioInputs, return audio_embeds def _parse_and_validate_image_input(self, - **kwargs: object) -> Optional[Dict]: + **kwargs: object) -> Optional[dict]: input_image_embeds: NestedTensors = kwargs.get("input_image_embeds") if input_image_embeds is None: return None @@ -1238,7 +1238,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> None: weights = ((name, data) for name, data in weights if "lora" not in name) diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 34a7a73d057a..609746b48588 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -6,7 +6,7 @@ #!/usr/bin/env python3 import abc import math -from typing import List, Literal, Optional +from typing import Literal, Optional import numpy as np import torch @@ -746,7 +746,7 @@ class ConformerEncoder(TransformerEncoderBase): attention_group_size = attenion_heads = Multi-Query Attention """ - extra_multi_layer_output_idxs: List[int] + extra_multi_layer_output_idxs: list[int] def __init__( # pylint: disable-all self, diff --git a/vllm/model_executor/models/phi4mm_utils.py b/vllm/model_executor/models/phi4mm_utils.py index 4051763cec8c..f468fdbd5417 100644 --- a/vllm/model_executor/models/phi4mm_utils.py +++ b/vllm/model_executor/models/phi4mm_utils.py @@ -5,7 +5,7 @@ # but implemented by the Phi-Speech team #!/usr/bin/env python3 import math -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import torch.nn.functional as F @@ -1586,7 +1586,7 @@ def forward( memory: Optional[Tensor] = None, pos_emb: Optional[Tensor] = None, att_mask: Optional[Tensor] = None, - ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: + ) -> tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: """AttModule forward Args: diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 2dc55e4c352e..7f2e9fdf7c4e 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -22,7 +22,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only PhiMoE model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -505,8 +506,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -521,7 +522,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if (self.quant_config is not None and (scale_name := self.quant_config.get_cache_scale(name))): @@ -657,8 +658,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index c0b492dbfcb9..c664d2371e27 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -4,7 +4,7 @@ from collections.abc import Iterable, Mapping, Sequence from dataclasses import dataclass, fields from functools import cached_property -from typing import List, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -438,18 +438,18 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - def is_vision_encoder_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_encoder_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_encoder") - def is_vision_lang_adapter_weights(weight: Tuple[str, torch.Tensor]): + def is_vision_lang_adapter_weights(weight: tuple[str, torch.Tensor]): return weight[0].startswith("vision_language_adapter") - def is_patch_merger(weight: Tuple[str, torch.Tensor]): + def is_patch_merger(weight: tuple[str, torch.Tensor]): return weight[0].startswith("patch_merger") - def is_pre_mm_projector_norm(weight: Tuple[str, torch.Tensor]): + def is_pre_mm_projector_norm(weight: tuple[str, torch.Tensor]): return weight[0].startswith("pre_mm_projector_norm") # Get references to parameters for direct loading @@ -566,7 +566,7 @@ def apply_rotary_emb_vit( xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor, -) -> Tuple[torch.Tensor, torch.Tensor]: +) -> tuple[torch.Tensor, torch.Tensor]: xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) assert freqs_cis.dtype == torch.complex64 @@ -671,7 +671,7 @@ def forward( return x -def position_meshgrid(patch_embeds_list: List[torch.Tensor], ) -> torch.Tensor: +def position_meshgrid(patch_embeds_list: list[torch.Tensor], ) -> torch.Tensor: positions = torch.cat([ torch.stack( torch.meshgrid( @@ -733,7 +733,7 @@ def freqs_cis(self) -> torch.Tensor: def forward( self, - images: 
List[torch.Tensor], + images: list[torch.Tensor], ) -> torch.Tensor: """ Args: @@ -1023,7 +1023,7 @@ def forward( hidden_states: torch.Tensor, attention_mask: torch.Tensor, position_embeddings: torch.Tensor, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: batch, patches, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) @@ -1249,8 +1249,8 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1260,7 +1260,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/plamo2.py b/vllm/model_executor/models/plamo2.py index 790c48ccd216..55a65f8078a4 100644 --- a/vllm/model_executor/models/plamo2.py +++ b/vllm/model_executor/models/plamo2.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Inference-only PLaMo2 model.""" import math -from typing import Iterable, Optional, Tuple +from collections.abc import Iterable +from typing import Optional import torch from torch import nn @@ -659,7 +660,7 @@ def get_seqlen_agnostic_capture_inputs(self, batch_size: int): return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: world_size = get_tensor_model_parallel_world_size() hidden_size = (self.config.mamba_num_heads * self.config.hidden_size_per_head) @@ -682,7 +683,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/prithvi_geospatial_mae.py b/vllm/model_executor/models/prithvi_geospatial_mae.py index c10ef45440b1..40ac5e30a368 100644 --- a/vllm/model_executor/models/prithvi_geospatial_mae.py +++ b/vllm/model_executor/models/prithvi_geospatial_mae.py @@ -16,7 +16,7 @@ # limitations under the License. 
"""Inference-only IBM/NASA Prithvi Geospatial model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Optional, Set, Tuple, Union +from typing import Optional, Union import torch import torch.nn as nn @@ -154,7 +154,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): "by PrithviGeospatialMAE.") def _parse_and_validate_multimodal_data( - self, **kwargs) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + self, **kwargs) -> tuple[torch.Tensor, Optional[torch.Tensor]]: pixel_values = kwargs.pop("pixel_values", None) if not isinstance(pixel_values, torch.Tensor): @@ -195,8 +195,8 @@ def pooler( ) -> Optional[PoolerOutput]: return PoolerOutput([PoolingSequenceGroupOutput(hidden_states)]) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: params_list = [] model_buffers = dict(self.named_buffers()) loaded_buffers = [] diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index e75294bc6cba..2fda87a4ff0f 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -6,7 +6,8 @@ # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" import json -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -76,7 +77,7 @@ def __init__( num_heads: int, max_position_embeddings: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", @@ -166,7 +167,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -284,15 +285,15 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 60f8a7cd7270..108d002e601b 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Any, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -108,7 +109,7 @@ def __init__( rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER, dual_chunk_attention_config: Optional[dict[str, Any]] = None, @@ -245,7 +246,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -367,8 +368,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -378,7 +379,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -490,8 +491,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] @@ -559,7 +560,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) weights = ((name, data) for name, data in weights if not name.startswith("lm_head.")) diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index d8e178f9cd47..d89b822dd873 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -21,10 +21,10 @@ # limitations under the License. 
"""Inference-only Qwen2.5-Omni model (thinker part).""" +from collections.abc import Iterable, Mapping, Sequence from copy import copy from functools import partial -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Sequence, - Set, Tuple, Union) +from typing import Any, Optional, Union import torch import torch.nn as nn @@ -138,7 +138,7 @@ def get_hf_processor( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5OmniProcessor: if fps is not None: @@ -550,7 +550,7 @@ def _parse_and_validate_audio_input( def _parse_and_validate_image_input( self, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> Optional[Qwen2_5_VLImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) @@ -589,7 +589,7 @@ def _parse_and_validate_image_input( def _parse_and_validate_video_input( self, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> Optional[Qwen2_5_VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) video_embeds = kwargs.pop("video_embeds", None) @@ -627,7 +627,7 @@ def _parse_and_validate_video_input( def _process_audio_input( self, audio_input: Qwen2AudioInputs, - audio_hashes: List[str] = None, + audio_hashes: list[str] = None, cached_audio_features: torch.Tensor = None, ) -> torch.Tensor: @@ -676,7 +676,7 @@ def _process_image_input( def _process_video_input( self, video_input: Qwen2_5_VLVideoInputs, - video_hashes: List[str] = None, + video_hashes: list[str] = None, cached_video_embeds: torch.Tensor = None) -> torch.Tensor: if video_input["type"] == "video_embeds": return video_input["video_embeds"].type(self.visual.dtype) @@ -825,7 +825,7 @@ def get_multimodal_embeddings_v0( if audio_input is None and image_input is None and video_input is None: return None - multimodal_embeddings: List[Tuple[NestedTensors, str]] = [] + multimodal_embeddings: list[tuple[NestedTensors, str]] = [] if audio_input is not None: audio_embeds = self._process_audio_input(audio_input) @@ -891,8 +891,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=["talker.", "token2wav."], diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 8728de95134d..5904ad1f1f24 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -24,9 +24,9 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" +from collections.abc import Iterable, Mapping from functools import partial -from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Set, - Tuple, TypedDict, Union) +from typing import Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -91,7 +91,7 @@ class Qwen2_5_VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. 
Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). @@ -137,7 +137,7 @@ class Qwen2_5_VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -709,8 +709,8 @@ def forward( hidden_states = hidden_states[reverse_indices, :] return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("attn.qkv.", "attn.q.", "q"), @@ -718,7 +718,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("attn.qkv.", "attn.v.", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -750,7 +750,7 @@ def get_hf_processor( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, size: Optional[dict[str, int]] = None, - fps: Optional[Union[float, List[float]]] = None, + fps: Optional[Union[float, list[float]]] = None, **kwargs: object, ) -> Qwen2_5_VLProcessor: if fps is not None: @@ -1116,8 +1116,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index f30bf08ab18b..3182a7532578 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Optional, TypedDict, Union import torch import torch.nn as nn @@ -403,7 +403,7 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index ae1c146cf3f2..7cf98dc7a4ea 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -23,7 +23,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch import torch.nn.functional as F @@ -169,12 +170,12 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", - dual_chunk_attention_config: Optional[Dict[str, Any]] = None, + dual_chunk_attention_config: Optional[dict[str, Any]] = None, ) -> None: super().__init__() self.hidden_size = hidden_size @@ -389,8 +390,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -409,7 +410,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). @@ -532,8 +533,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 90f799e6734e..81dc38988c9d 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -5,7 +5,8 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. 
"""Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -95,8 +96,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ac0a6de523df..0ff0836b0897 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -25,8 +25,7 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping, Sequence from functools import partial -from typing import (Any, Callable, Literal, Optional, Set, Tuple, TypedDict, - Union) +from typing import Any, Callable, Literal, Optional, TypedDict, Union import torch import torch.nn as nn @@ -102,7 +101,7 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] image_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all images' features. + - list[`torch.Tensor`]: A list of tensors holding all images' features. Each tensor holds an image's features. - `torch.Tensor`: A tensor holding all images' features (concatenation of all images' feature tensors). @@ -142,7 +141,7 @@ class Qwen2VLVideoEmbeddingInputs(TypedDict): type: Literal["video_embeds"] video_embeds: torch.Tensor """Supported types: - - List[`torch.Tensor`]: A list of tensors holding all videos' features. + - list[`torch.Tensor`]: A list of tensors holding all videos' features. Each tensor holds an video's features. - `torch.Tensor`: A tensor holding all videos' features (concatenation of all videos' feature tensors). @@ -662,8 +661,8 @@ def forward( return x - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -671,7 +670,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: @@ -1394,8 +1393,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/qwen3.py b/vllm/model_executor/models/qwen3.py index 40e0ccc1bab6..dbe2be8a73d5 100644 --- a/vllm/model_executor/models/qwen3.py +++ b/vllm/model_executor/models/qwen3.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen3 model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -63,7 +64,7 @@ def __init__(self, rope_theta: float = 10000, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, - rope_scaling: Optional[Tuple] = None, + rope_scaling: Optional[tuple] = None, prefix: str = "", attn_type: str = AttentionType.DECODER) -> None: super().__init__() @@ -201,7 +202,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -309,8 +310,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 1fef37a96ea9..aae5401721df 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -21,7 +21,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Qwen3MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -149,7 +150,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, head_dim: Optional[int] = None, rms_norm_eps: float = 1e-06, @@ -373,8 +374,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -393,7 +394,7 @@ def load_weights(self, weights: Iterable[Tuple[str, num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). 
@@ -527,8 +528,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["rotary_emb.inv_freq"]), diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index 199b885a5850..3701153bace5 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -9,10 +9,9 @@ import math import re import unicodedata -from collections.abc import Collection, Mapping, Sequence -from collections.abc import Set as AbstractSet +from collections.abc import Collection, Mapping, Sequence, Set from functools import lru_cache, partial -from typing import Callable, List, Literal, Optional, TypedDict, Union +from typing import Callable, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -395,7 +394,7 @@ class TokenizerWithoutImagePad(tokenizer.__class__): # type: ignore def tokenize( self, text: str, - allowed_special: Union[AbstractSet[str], str] = "all", + allowed_special: Union[Set[str], str] = "all", disallowed_special: Union[Collection[str], str] = (), **kwargs, ) -> list[Union[bytes, str]]: @@ -411,7 +410,7 @@ def tokenize( def _decode( self, - token_ids: Union[int, List[int]], + token_ids: Union[int, list[int]], skip_special_tokens: bool = False, errors: Optional[str] = None, **kwargs, diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 06a0e6574630..c55f7ccd344f 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -10,10 +10,10 @@ import sys import tempfile from abc import ABC, abstractmethod +from collections.abc import Set from dataclasses import dataclass, field from functools import lru_cache -from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type, - TypeVar, Union) +from typing import Callable, Optional, TypeVar, Union import cloudpickle import torch.nn as nn @@ -266,7 +266,7 @@ class _ModelInfo: supports_v0_only: bool @staticmethod - def from_model_cls(model: Type[nn.Module]) -> "_ModelInfo": + def from_model_cls(model: type[nn.Module]) -> "_ModelInfo": return _ModelInfo( architecture=model.__name__, is_text_generation_model=is_text_generation_model(model), @@ -290,7 +290,7 @@ def inspect_model_cls(self) -> _ModelInfo: raise NotImplementedError @abstractmethod - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: raise NotImplementedError @@ -301,10 +301,10 @@ class _RegisteredModel(_BaseRegisteredModel): """ interfaces: _ModelInfo - model_cls: Type[nn.Module] + model_cls: type[nn.Module] @staticmethod - def from_model_cls(model_cls: Type[nn.Module]): + def from_model_cls(model_cls: type[nn.Module]): return _RegisteredModel( interfaces=_ModelInfo.from_model_cls(model_cls), model_cls=model_cls, @@ -313,7 +313,7 @@ def from_model_cls(model_cls: Type[nn.Module]): def inspect_model_cls(self) -> _ModelInfo: return self.interfaces - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: return self.model_cls @@ -330,7 +330,7 @@ def inspect_model_cls(self) -> _ModelInfo: return _run_in_subprocess( lambda: _ModelInfo.from_model_cls(self.load_model_cls())) - def load_model_cls(self) -> Type[nn.Module]: + def load_model_cls(self) -> type[nn.Module]: mod = importlib.import_module(self.module_name) return getattr(mod, 
self.class_name) @@ -339,7 +339,7 @@ def load_model_cls(self) -> Type[nn.Module]: def _try_load_model_cls( model_arch: str, model: _BaseRegisteredModel, -) -> Optional[Type[nn.Module]]: +) -> Optional[type[nn.Module]]: from vllm.platforms import current_platform current_platform.verify_model_arch(model_arch) try: @@ -366,15 +366,15 @@ def _try_inspect_model_cls( @dataclass class _ModelRegistry: # Keyed by model_arch - models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) + models: dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> AbstractSet[str]: + def get_supported_archs(self) -> Set[str]: return self.models.keys() def register_model( self, model_arch: str, - model_cls: Union[Type[nn.Module], str], + model_cls: Union[type[nn.Module], str], ) -> None: """ Register an external model to be used in vLLM. @@ -413,7 +413,7 @@ def register_model( self.models[model_arch] = model - def _raise_for_unsupported(self, architectures: List[str]): + def _raise_for_unsupported(self, architectures: list[str]): all_supported_archs = self.get_supported_archs() if any(arch in all_supported_archs for arch in architectures): @@ -426,7 +426,7 @@ def _raise_for_unsupported(self, architectures: List[str]): f"Supported architectures: {all_supported_archs}") def _try_load_model_cls(self, - model_arch: str) -> Optional[Type[nn.Module]]: + model_arch: str) -> Optional[type[nn.Module]]: if model_arch not in self.models: return None @@ -440,8 +440,8 @@ def _try_inspect_model_cls(self, model_arch: str) -> Optional[_ModelInfo]: def _normalize_archs( self, - architectures: Union[str, List[str]], - ) -> List[str]: + architectures: Union[str, list[str]], + ) -> list[str]: if isinstance(architectures, str): architectures = [architectures] if not architectures: @@ -458,8 +458,8 @@ def _normalize_archs( def inspect_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[_ModelInfo, str]: + architectures: Union[str, list[str]], + ) -> tuple[_ModelInfo, str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -471,8 +471,8 @@ def inspect_model_cls( def resolve_model_cls( self, - architectures: Union[str, List[str]], - ) -> Tuple[Type[nn.Module], str]: + architectures: Union[str, list[str]], + ) -> tuple[type[nn.Module], str]: architectures = self._normalize_archs(architectures) for arch in architectures: @@ -484,77 +484,77 @@ def resolve_model_cls( def is_text_generation_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_text_generation_model def is_pooling_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_pooling_model def is_cross_encoder_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_cross_encoding def is_multimodal_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_multimodal def is_pp_supported_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_pp def model_has_inner_state( 
self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_inner_state def is_attention_free_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_attention_free def is_hybrid_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.is_hybrid def is_noops_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.has_noops def is_transcription_model( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return model_cls.supports_transcription def is_v1_compatible( self, - architectures: Union[str, List[str]], + architectures: Union[str, list[str]], ) -> bool: model_cls, _ = self.inspect_model_cls(architectures) return not model_cls.supports_v0_only diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py index ebefe7689c97..9a4d0ab2dd4d 100644 --- a/vllm/model_executor/models/roberta.py +++ b/vllm/model_executor/models/roberta.py @@ -1,7 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 import itertools -from typing import Iterable, Optional, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -135,7 +136,7 @@ def _build_model(self, prefix=prefix, embedding_class=RobertaEmbedding) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): weights = self.hf_to_vllm_mapper.apply(weights) # Separate weights in "roberta"-prefixed and all else (not in memory). # For use with models like FacebookAI/roberta-base. 
@@ -187,7 +188,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.classifier = RobertaClassificationHead(config) self._pooler = CrossEncodingPooler(config, self.classifier) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): bert_weights, task_weights = roberta_task_weights_filter(weights) bert_weights = self.jina_to_vllm_mapper.apply(bert_weights) @@ -249,8 +250,8 @@ def create_position_ids_from_input_ids(input_ids, def roberta_task_weights_filter( - all_weights: Iterable[Tuple[str, torch.Tensor]] -) -> Tuple[Iterable[Tuple[str, torch.Tensor]], Iterable[Tuple[str, + all_weights: Iterable[tuple[str, torch.Tensor]] +) -> tuple[Iterable[tuple[str, torch.Tensor]], Iterable[tuple[str, torch.Tensor]]]: """ Separate task-specific weights that are applied on top diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 75fcf540b0b1..3b5334afa7af 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -3,7 +3,8 @@ within a vision language model.""" import math -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -265,7 +266,7 @@ def __init__( def forward( self, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, None]: + ) -> tuple[torch.Tensor, None]: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) @@ -480,8 +481,8 @@ def forward( feature_sample_layers=feature_sample_layers, ) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -489,7 +490,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py index e78c37b65f87..91f6c7753c68 100644 --- a/vllm/model_executor/models/skyworkr1v.py +++ b/vllm/model_executor/models/skyworkr1v.py @@ -8,7 +8,7 @@ # -------------------------------------------------------- from abc import ABC, abstractmethod from collections.abc import Iterable, Mapping, Sequence -from typing import Literal, Optional, Set, Tuple, TypedDict, TypeVar, Union +from typing import Literal, Optional, TypedDict, TypeVar, Union import torch import torch.nn as nn @@ -937,8 +937,8 @@ def compute_logits( return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: skip_prefixes = [ "action_embed", "temporal_embed", "track_embed", "track_embed_decoder", "box_token", "cg_criterion", "cg_model", diff --git a/vllm/model_executor/models/smolvlm.py b/vllm/model_executor/models/smolvlm.py index 17217dc9a247..31dec55026ba 100644 --- a/vllm/model_executor/models/smolvlm.py +++ b/vllm/model_executor/models/smolvlm.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 -from typing import Dict, Optional +from typing import Optional from transformers import 
SmolVLMProcessor @@ -21,7 +21,7 @@ class SmolVLMProcessingInfo(Idefics3ProcessingInfo): def get_hf_processor( self, *, - max_image_size: Optional[Dict[str, int]] = None, + max_image_size: Optional[dict[str, int]] = None, **kwargs: object, ) -> SmolVLMProcessor: if max_image_size is not None: diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index f86aff7ba7ef..1c9f3c77c7a8 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -23,7 +23,8 @@ # limitations under the License. """Inference-only Solar model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Any, Optional, Union import torch from torch import nn @@ -101,7 +102,7 @@ def __init__( num_heads: int, num_kv_heads: int, rope_theta: float = 10000, - rope_scaling: Optional[Dict[str, Any]] = None, + rope_scaling: Optional[dict[str, Any]] = None, max_position_embeddings: int = 8192, quant_config: Optional[QuantizationConfig] = None, bias: bool = False, @@ -236,7 +237,7 @@ def forward( positions: torch.Tensor, hidden_states: torch.Tensor, residual: Optional[torch.Tensor], - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention if residual is None: residual = hidden_states @@ -437,8 +438,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -448,7 +449,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 1cbda7267e4c..8c2ad6f19251 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -20,7 +20,8 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -180,7 +181,7 @@ def forward( self, positions: torch.Tensor, hidden_states: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -252,8 +253,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -263,7 +264,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, 
weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -335,8 +336,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 6eebe4c4d614..5927afa91f49 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -19,7 +19,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch Starcoder2 model.""" -from typing import Iterable, Optional, Set, Tuple, Union +from collections.abc import Iterable +from typing import Optional, Union import torch from torch import nn @@ -255,8 +256,8 @@ def forward( hidden_states = self.norm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -265,7 +266,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters(remove_duplicate=False)) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -342,8 +343,8 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, # Models trained using ColossalAI may include these tensors in diff --git a/vllm/model_executor/models/telechat2.py b/vllm/model_executor/models/telechat2.py index 379e19e1beea..7d713d23c772 100644 --- a/vllm/model_executor/models/telechat2.py +++ b/vllm/model_executor/models/telechat2.py @@ -19,7 +19,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Iterable, Set, Tuple +from collections.abc import Iterable import torch import torch.nn as nn @@ -50,14 +50,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): layer.mlp.gate_up_proj.bias = None layer.mlp.gate_up_proj.skip_bias_add = True - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ ('gate_up_proj', 'gate_proj', 0), ('gate_up_proj', 'up_proj', 1), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() total_num_heads = self.config.n_head head_dim = self.config.hidden_size // total_num_heads for name, loaded_weight in weights: @@ -128,8 +128,8 @@ def _init_model(self, layer_type: type[nn.Module] = LlamaDecoderLayer): return TeleChat2Model(vllm_config=vllm_config, prefix=prefix) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader( self, diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 7b946ad6aac7..a8f30b2f27bf 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -15,7 +15,8 @@ # limitations under the License. """Wrapper around `transformers` models""" import re -from typing import Iterable, Literal, Optional, Union +from collections.abc import Iterable +from typing import Literal, Optional, Union import torch from torch import nn diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 0bc5d218f8d0..c1a4dc1b33d7 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,7 @@ # Adapted from https://github.com/fixie-ai/ultravox/blob/ecd58c4041030bae2ad15aa6bcf04ab43199ea02/ultravox/model/ultravox_model.py """PyTorch Ultravox model.""" from collections.abc import Iterable, Mapping, Sequence -from typing import Any, Literal, Optional, Set, Tuple, TypedDict, Union +from typing import Any, Literal, Optional, TypedDict, Union import torch from torch import nn @@ -619,8 +619,8 @@ def compute_logits(self, hidden_states: torch.Tensor, return self.language_model.compute_logits(hidden_states, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 0458e3ce03b5..5cc501622891 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 import itertools +from collections.abc import Iterable, Mapping from dataclasses import dataclass, field -from typing import (Callable, Dict, Iterable, List, Literal, Mapping, Optional, - Protocol, Set, Tuple, Union, overload) +from typing import Callable, Literal, Optional, Protocol, Union, overload import torch import torch.nn as nn @@ -58,8 +58,8 @@ def _map_name(self, key: str) -> Optional[str]: return key def apply( - self, weights: Iterable[Tuple[str, torch.Tensor]] - ) -> Iterable[Tuple[str, torch.Tensor]]: + self, weights: Iterable[tuple[str, torch.Tensor]] + ) -> Iterable[tuple[str, 
torch.Tensor]]: return ((out_name, data) for name, data in weights if (out_name := self._map_name(name)) is not None) @@ -84,8 +84,8 @@ def __init__( self, module: nn.Module, *, - skip_prefixes: Optional[List[str]] = None, - ignore_unexpected_prefixes: Optional[List[str]] = None, + skip_prefixes: Optional[list[str]] = None, + ignore_unexpected_prefixes: Optional[list[str]] = None, ) -> None: super().__init__() @@ -95,8 +95,8 @@ def __init__( def _groupby_prefix( self, - weights: Iterable[Tuple[str, torch.Tensor]], - ) -> Iterable[Tuple[str, Iterable[Tuple[str, torch.Tensor]]]]: + weights: Iterable[tuple[str, torch.Tensor]], + ) -> Iterable[tuple[str, Iterable[tuple[str, torch.Tensor]]]]: weights_by_parts = ((weight_name.split(".", 1), weight_data) for weight_name, weight_data in weights) @@ -129,7 +129,7 @@ def _load_param( self, base_prefix: str, param: nn.Parameter, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: for weight_name, weight_data in weights: weight_qualname = self._get_qualname(base_prefix, weight_name) @@ -159,7 +159,7 @@ def _load_param( yield weight_qualname def _add_loadable_non_param_tensors(self, module: nn.Module, - child_params: Dict[str, torch.Tensor]): + child_params: dict[str, torch.Tensor]): """ Add tensor names that are not in the model params that may be in the safetensors, e.g., batch normalization stats. @@ -182,7 +182,7 @@ def _load_module( self, base_prefix: str, module: nn.Module, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], ) -> Iterable[str]: if isinstance(module, PPMissingLayer): return @@ -251,10 +251,10 @@ def _load_module( def load_weights( self, - weights: Iterable[Tuple[str, torch.Tensor]], + weights: Iterable[tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> Set[str]: + ) -> set[str]: if mapper is not None: weights = mapper.apply(weights) @@ -292,13 +292,13 @@ def flatten_bn(x: torch.Tensor) -> torch.Tensor: @overload -def flatten_bn(x: List[torch.Tensor]) -> List[torch.Tensor]: +def flatten_bn(x: list[torch.Tensor]) -> list[torch.Tensor]: ... @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: Literal[True], ) -> torch.Tensor: @@ -307,18 +307,18 @@ def flatten_bn( @overload def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: ... def flatten_bn( - x: Union[List[torch.Tensor], torch.Tensor], + x: Union[list[torch.Tensor], torch.Tensor], *, concat: bool = False, -) -> Union[List[torch.Tensor], torch.Tensor]: +) -> Union[list[torch.Tensor], torch.Tensor]: """ Flatten the ``B`` and ``N`` dimensions of batched multimodal inputs. @@ -442,7 +442,7 @@ def merge_multimodal_embeddings( input_ids: torch.Tensor, inputs_embeds: torch.Tensor, multimodal_embeddings: NestedTensors, - placeholder_token_id: Union[int, List[int]], + placeholder_token_id: Union[int, list[int]], ) -> torch.Tensor: """ Merge ``multimodal_embeddings`` into ``inputs_embeds`` by overwriting the @@ -596,7 +596,7 @@ def make_layers( num_hidden_layers: int, layer_fn: LayerFn, prefix: str, -) -> Tuple[int, int, torch.nn.ModuleList]: +) -> tuple[int, int, torch.nn.ModuleList]: """Make a list of layers with the given layer function, taking pipeline parallelism into account. 
""" @@ -614,10 +614,10 @@ def make_layers( # NOTE: don't use lru_cache here because it can prevent garbage collection -_model_to_pp_missing_layer_names: Dict[int, List[str]] = {} +_model_to_pp_missing_layer_names: dict[int, list[str]] = {} -def get_pp_missing_layer_names(model: torch.nn.Module) -> List[str]: +def get_pp_missing_layer_names(model: torch.nn.Module) -> list[str]: """Get the names of the missing layers in a pipeline parallel model.""" model_id = id(model) if model_id in _model_to_pp_missing_layer_names: @@ -645,7 +645,7 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool: for missing_layer_name in get_pp_missing_layer_names(model)) -def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): +def make_empty_intermediate_tensors_factory(keys: list[str], hidden_size: int): def make_empty_intermediate_tensors( batch_size: int, @@ -684,7 +684,7 @@ def extract_layer_index(layer_name: str) -> int: - "model.encoder.layers.0.sub.1" -> ValueError """ subnames = layer_name.split(".") - int_vals: List[int] = [] + int_vals: list[int] = [] for subname in subnames: try: int_vals.append(int(subname)) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index 908cd7885aa8..c6e303d6024a 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -2,7 +2,7 @@ import math from collections.abc import Iterable, Mapping, Sequence -from typing import List, Optional, Set, Tuple, TypedDict, Union +from typing import Optional, TypedDict, Union import torch from torch import nn @@ -382,7 +382,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.embed_positions.weight.copy_( sinusoids(*self.embed_positions.weight.shape)) - def forward(self, input_features: Union[torch.Tensor, List[torch.Tensor]]): + def forward(self, input_features: Union[torch.Tensor, list[torch.Tensor]]): hidden_states = [] for features in input_features: embeds = nn.functional.gelu(self.conv1(features)) @@ -460,7 +460,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def forward( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], input_ids: Optional[torch.Tensor], positions: torch.Tensor, ) -> torch.Tensor: @@ -474,14 +474,14 @@ def forward( def get_encoder_outputs( self, - input_features: Optional[Union[torch.Tensor, List[torch.Tensor]]], + input_features: Optional[Union[torch.Tensor, list[torch.Tensor]]], ) -> Optional[torch.Tensor]: if input_features is None: return None return self.encoder(input_features) - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), @@ -491,7 +491,7 @@ def load_weights(self, weights: Iterable[Tuple[str, (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -722,8 +722,8 @@ def compute_logits(self, hidden_states: torch.Tensor, sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: 
Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=["proj_out."]) # add fake zeros bias for k_proj to state_dict @@ -732,8 +732,8 @@ def load_weights(self, weights: Iterable[Tuple[str, def _create_fake_bias_for_k_proj( - weights: Iterable[Tuple[str, torch.Tensor]] -) -> Iterable[Tuple[str, torch.Tensor]]: + weights: Iterable[tuple[str, torch.Tensor]] +) -> Iterable[tuple[str, torch.Tensor]]: """ Create full zeros bias for k_proj weight in self-attn and x-attn layers. So that the bias for k_proj in qkv_proj can be initialized with zeros. diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index eddccbba5a2d..48e254bdd85b 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -6,8 +6,9 @@ architectures in a hybrid model optimized for efficient sequence modeling. The model alternates between state space model layers and attention-based layers. """ +from collections.abc import Iterable from itertools import cycle -from typing import Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Optional, Union import torch from torch import nn @@ -54,7 +55,7 @@ def __init__( self, input_dim: int, rank: int, - output_dim: Union[int, List[int]], + output_dim: Union[int, list[int]], quant_config: Optional[QuantizationConfig] = None, ): """Initialize the attention layer. @@ -279,7 +280,7 @@ def __init__( self, config: Zamba2Config, bare_block_idx: int, - num_hybrid_layers: Dict[int, int], + num_hybrid_layers: dict[int, int], quant_config: Optional[QuantizationConfig] = None, ) -> None: """Initialize the MLP layer. @@ -769,8 +770,8 @@ def forward( hidden_states = self.final_layernorm(hidden_states) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -779,7 +780,7 @@ def load_weights(self, weights: Iterable[Tuple[str, ] params_dict = dict(self.named_parameters()) - loaded_params: Set[str] = set() + loaded_params: set[str] = set() for chkpt_weight_name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in chkpt_weight_name: @@ -914,9 +915,9 @@ def forward(self, return hidden_states - def copy_inputs_before_cuda_graphs(self, input_buffers: Dict[str, + def copy_inputs_before_cuda_graphs(self, input_buffers: dict[str, torch.Tensor], - **kwargs) -> Dict[str, torch.Tensor]: + **kwargs) -> dict[str, torch.Tensor]: """Copy inputs before CUDA graph capture. Args: @@ -930,7 +931,7 @@ def copy_inputs_before_cuda_graphs(self, input_buffers: Dict[str, input_buffers, **kwargs) def get_seqlen_agnostic_capture_inputs( - self, batch_size: int) -> Dict[str, torch.Tensor]: + self, batch_size: int) -> dict[str, torch.Tensor]: """Get inputs for sequence-length-agnostic graph capture. Args: @@ -941,7 +942,7 @@ def get_seqlen_agnostic_capture_inputs( return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size) def _get_mamba_cache_shape( - self) -> Tuple[Tuple[int, int], Tuple[int, int]]: + self) -> tuple[tuple[int, int], tuple[int, int]]: """Calculate shapes for Mamba's convolutional and state caches. 
Returns: @@ -1001,7 +1002,7 @@ def compute_logits( sampling_metadata) return logits - def load_weights(self, weights: Iterable[Tuple[str, - torch.Tensor]]) -> Set[str]: + def load_weights(self, weights: Iterable[tuple[str, + torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) From e6b8e65d2d68fc96871bc2f07999cb495e054ced Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 15 May 2025 07:26:34 +0200 Subject: [PATCH 22/58] [Bugfix] Fix fp8 tests for triton_unified_attention for Triton 3.3 (#18013) Signed-off-by: Thomas Parnell Co-authored-by: Lucas Wilkinson --- .../kernels/{ => attention}/test_triton_unified_attention.py | 3 +++ vllm/attention/ops/triton_unified_attention.py | 4 ++++ 2 files changed, 7 insertions(+) rename tests/kernels/{ => attention}/test_triton_unified_attention.py (98%) diff --git a/tests/kernels/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py similarity index 98% rename from tests/kernels/test_triton_unified_attention.py rename to tests/kernels/attention/test_triton_unified_attention.py index 50da8e5fd5cd..4e15d00255a4 100644 --- a/tests/kernels/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -99,6 +99,9 @@ def test_triton_unified_attn( ) -> None: torch.set_default_device("cuda") + if q_dtype is not None and q_dtype.itemsize < 2 and block_size < 32: + pytest.skip("block size must be at least 32 for fp8") + current_platform.seed_everything(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index 8c0cf9267f35..f08000a75bc7 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -268,6 +268,10 @@ def unified_attention( assert causal, "Only causal attention is supported" assert q_descale is None, "Q scales not supported" + block_size = v.shape[1] + assert q.element_size() >= 2 or block_size >= 32, \ + "Block size must be at least 32 for fp8" + use_alibi_slopes = alibi_slopes is not None block_size = v.shape[1] From 4f07a640759283a09862e1ad74e390b469891918 Mon Sep 17 00:00:00 2001 From: Chenheli Hua Date: Wed, 14 May 2025 22:26:49 -0700 Subject: [PATCH 23/58] Support custom implementations of VideoLoader backends. 
(#18091) --- tests/multimodal/test_video.py | 41 ++++++++++++++++++++++++++++++++++ vllm/envs.py | 11 +++++++++ vllm/multimodal/video.py | 33 +++++++++++++++++++++++++-- 3 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 tests/multimodal/test_video.py diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py new file mode 100644 index 000000000000..e67624ecefcb --- /dev/null +++ b/tests/multimodal/test_video.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +import numpy as np +import numpy.typing as npt +import pytest + +from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader + +NUM_FRAMES = 10 +FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3) +FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3) + + +@VIDEO_LOADER_REGISTRY.register("test_video_loader_1") +class TestVideoLoader1(VideoLoader): + + @classmethod + def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: + return FAKE_OUTPUT_1 + + +@VIDEO_LOADER_REGISTRY.register("test_video_loader_2") +class TestVideoLoader2(VideoLoader): + + @classmethod + def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: + return FAKE_OUTPUT_2 + + +def test_video_loader_registry(): + custom_loader_1 = VIDEO_LOADER_REGISTRY.load("test_video_loader_1") + output_1 = custom_loader_1.load_bytes(b"test") + np.testing.assert_array_equal(output_1, FAKE_OUTPUT_1) + + custom_loader_2 = VIDEO_LOADER_REGISTRY.load("test_video_loader_2") + output_2 = custom_loader_2.load_bytes(b"test") + np.testing.assert_array_equal(output_2, FAKE_OUTPUT_2) + + +def test_video_loader_type_doesnt_exist(): + with pytest.raises(AssertionError): + VIDEO_LOADER_REGISTRY.load("non_existing_video_loader") diff --git a/vllm/envs.py b/vllm/envs.py index 9d585bf3578e..fe3fa91fbe33 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -55,6 +55,7 @@ VLLM_IMAGE_FETCH_TIMEOUT: int = 5 VLLM_VIDEO_FETCH_TIMEOUT: int = 30 VLLM_AUDIO_FETCH_TIMEOUT: int = 10 + VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MM_INPUT_CACHE_GIB: int = 8 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None @@ -446,6 +447,16 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: "VLLM_AUDIO_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), + # Backend for Video IO + # - "opencv": Default backend that uses OpenCV stream buffered backend. + # + # Custom backend implementations can be registered + # via `@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")` and + # imported at runtime. + # If a non-existing backend is used, an AssertionError will be thrown. 
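For reference, a custom backend registered through this hook might look like the sketch below. The loader name, the decode logic, and the returned frame shape are illustrative assumptions; the registry, the VideoLoader base class, and the environment variable are the ones introduced in this patch.

import numpy as np
import numpy.typing as npt

from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader


@VIDEO_LOADER_REGISTRY.register("my_custom_video_loader")
class MyCustomVideoLoader(VideoLoader):

    @classmethod
    def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
        # Placeholder decode: a real implementation would parse `data` and
        # honor `num_frames`; here a fixed-shape frame stack is returned.
        frames = num_frames if num_frames > 0 else 1
        return np.zeros((frames, 720, 1280, 3), dtype=np.uint8)

Setting VLLM_VIDEO_LOADER_BACKEND=my_custom_video_loader then makes the __init__ shown at the end of this diff resolve this class through VIDEO_LOADER_REGISTRY.load() instead of the default "opencv" backend.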
+ "VLLM_VIDEO_LOADER_BACKEND": + lambda: os.getenv("VLLM_VIDEO_LOADER_BACKEND", "opencv"), + # Cache size (in GiB) for multimodal input cache # Default is 4 GiB "VLLM_MM_INPUT_CACHE_GIB": diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 72e9b65d763c..3685fd4c3458 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 +from abc import abstractmethod from functools import partial from io import BytesIO from pathlib import Path @@ -9,6 +10,8 @@ import numpy.typing as npt from PIL import Image +from vllm import envs + from .base import MediaIO from .image import ImageMediaIO @@ -48,10 +51,35 @@ def sample_frames_from_video(frames: npt.NDArray, class VideoLoader: @classmethod - def load_bytes(self, data: bytes, num_frames: int = -1) -> npt.NDArray: + @abstractmethod + def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray: raise NotImplementedError +class VideoLoaderRegistry: + + def __init__(self) -> None: + self.name2class: dict[str, type] = {} + + def register(self, name: str): + + def wrap(cls_to_register): + self.name2class[name] = cls_to_register + return cls_to_register + + return wrap + + @staticmethod + def load(cls_name: str) -> VideoLoader: + cls = VIDEO_LOADER_REGISTRY.name2class.get(cls_name) + assert cls is not None, f"VideoLoader class {cls_name} not found" + return cls() + + +VIDEO_LOADER_REGISTRY = VideoLoaderRegistry() + + +@VIDEO_LOADER_REGISTRY.register("opencv") class OpenCVVideoBackend(VideoLoader): def get_cv2_video_api(self): @@ -122,7 +150,8 @@ def __init__( self.image_io = image_io self.num_frames = num_frames - self.video_loader = OpenCVVideoBackend + video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND + self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend) def load_bytes(self, data: bytes) -> npt.NDArray: return self.video_loader.load_bytes(data, self.num_frames) From 420caf7557f85635fae09b3f2c27e38ac751551f Mon Sep 17 00:00:00 2001 From: Ning Xie Date: Thu, 15 May 2025 13:28:11 +0800 Subject: [PATCH 24/58] [UT] Add ut for none hash (#17892) Signed-off-by: Andy Xie --- tests/v1/core/test_kv_cache_utils.py | 41 +++++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index e572100fe7a1..43a27da2dbe4 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import importlib import pytest import torch @@ -10,8 +11,7 @@ from vllm.v1.core.kv_cache_manager import KVCacheManager # disable yapf here as it formats differently than isort such that both fail # yapf: disable -from vllm.v1.core.kv_cache_utils import (NONE_HASH, BlockHashType, - FreeKVCacheBlockQueue, KVCacheBlock, +from vllm.v1.core.kv_cache_utils import (FreeKVCacheBlockQueue, KVCacheBlock, PrefixCachingMetrics, estimate_max_model_len, generate_block_hash_extra_keys, @@ -65,13 +65,29 @@ def new_kv_cache_spec(block_size=16, sliding_window=sliding_window) -def test_none_hash(): - assert NONE_HASH is not None - assert isinstance(NONE_HASH, int) - assert NONE_HASH != 0 +def test_none_hash(monkeypatch): + import vllm.v1.core.kv_cache_utils + + # case 1: PYTHONHASHSEED is not set, use random + with monkeypatch.context() as m: + m.delenv('PYTHONHASHSEED', raising=False) + reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils) + assert reloaded_kv_cache_utils.NONE_HASH is 
not None + assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int) + assert reloaded_kv_cache_utils.NONE_HASH != 0 + + # case 2: PYTHONHASHSEED is set, use the seed + with monkeypatch.context() as m: + m.setenv('PYTHONHASHSEED', 'python hash seed') + reloaded_kv_cache_utils = importlib.reload(vllm.v1.core.kv_cache_utils) + assert reloaded_kv_cache_utils.NONE_HASH is not None + assert isinstance(reloaded_kv_cache_utils.NONE_HASH, int) + assert sha256('python hash seed') == reloaded_kv_cache_utils.NONE_HASH def test_kv_cache_block(): + import vllm.v1.core.kv_cache_utils + # Test KVCacheBlock initialization block = KVCacheBlock(block_id=0) assert block.block_id == 0 @@ -85,7 +101,8 @@ def test_kv_cache_block(): assert block.ref_cnt == 0 # Test block hash setting and resetting - block_hash = BlockHashType(hash_value=123, token_ids=(1, 2, 3)) + block_hash = vllm.v1.core.kv_cache_utils.BlockHashType(hash_value=123, + token_ids=(1, 2, 3)) block.block_hash = block_hash assert block.block_hash == block_hash @@ -259,13 +276,14 @@ def test_generate_block_hash_extra_keys_cache_salt(): @pytest.mark.parametrize("hash_fn", [sha256, hash]) def test_hash_block_tokens(hash_fn): + import vllm.v1.core.kv_cache_utils parent_block_hash = 123 curr_block_token_ids = (1, 2, 3) extra_keys = ("key1", "key2") block_hash = hash_block_tokens(hash_fn, parent_block_hash, curr_block_token_ids, extra_keys) - assert isinstance(block_hash, BlockHashType) + assert isinstance(block_hash, vllm.v1.core.kv_cache_utils.BlockHashType) assert block_hash.hash_value == hash_fn( (parent_block_hash, curr_block_token_ids, extra_keys)) assert block_hash.token_ids == curr_block_token_ids @@ -274,6 +292,7 @@ def test_hash_block_tokens(hash_fn): @pytest.mark.parametrize("hash_fn", [sha256, hash]) def test_hash_request_tokens(hash_fn): + import vllm.v1.core.kv_cache_utils request = make_request( request_id=0, prompt_token_ids=[_ for _ in range(6)], @@ -288,8 +307,10 @@ def test_hash_request_tokens(hash_fn): block_hashes = hash_request_tokens(hash_fn, block_size, request) assert len(block_hashes) == 2 - assert isinstance(block_hashes[0], BlockHashType) - assert isinstance(block_hashes[1], BlockHashType) + assert isinstance(block_hashes[0], + vllm.v1.core.kv_cache_utils.BlockHashType) + assert isinstance(block_hashes[1], + vllm.v1.core.kv_cache_utils.BlockHashType) # Check the first block assert block_hashes[0].token_ids == (0, 1, 2) From dd2a94596abe43eb0e556cde306fc80d442183c0 Mon Sep 17 00:00:00 2001 From: inkcherry Date: Thu, 15 May 2025 13:29:38 +0800 Subject: [PATCH 25/58] [Model] Allow the use of sliding window in Qwen2 (#17772) Signed-off-by: inkcherry --- vllm/model_executor/models/qwen2.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 108d002e601b..0d0d98c59dbc 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -291,14 +291,14 @@ def __init__(self, # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. 
Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) + assert config.max_window_layers == config.num_hidden_layers, ( + "Sliding window for some but all layers is not supported. " + "This model uses sliding window but `max_window_layers` = {} " + "is less than `num_hidden_layers` = {}. Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + )) self.config = config self.quant_config = quant_config From 70f8b967242633f4cf38a456127f102664d1da3e Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Thu, 15 May 2025 14:16:31 +0800 Subject: [PATCH 26/58] [Bugfix] Fix FusedMoEPrepareAndFinalize for cuda-disalike backends (#18178) Signed-off-by: Mengqing Cao --- vllm/model_executor/layers/fused_moe/layer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index d083e0040c0e..0b3c02d1ba28 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -43,6 +43,7 @@ from .pplx_prepare_finalize import PplxPrepareAndFinalize else: fused_experts = None # type: ignore + FusedMoEPrepareAndFinalize = None # type: ignore if is_rocm_aiter_moe_enabled(): from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( # noqa: E501 rocm_aiter_biased_group_topk as grouped_topk) From de71fec81b0082950ecea1e8db39125895ce44cc Mon Sep 17 00:00:00 2001 From: David Xia Date: Thu, 15 May 2025 02:17:16 -0400 Subject: [PATCH 27/58] [CI] don't skip fixed `test_kv_cache_events()` (#18183) Signed-off-by: David Xia --- tests/v1/engine/test_engine_core_client.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index 71ebd0a36e46..8bea032f656f 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -256,8 +256,6 @@ async def test_engine_core_client_asyncio(monkeypatch: pytest.MonkeyPatch): client.shutdown() -# TRACKING: https://github.com/vllm-project/vllm/issues/18167 -@pytest.mark.skip(reason="RE-ENABLE: this test is failing on main.") @pytest.mark.parametrize( "multiprocessing_mode,publisher_config", [(True, "tcp"), (False, "inproc")], From a8f5aec20ad685851f972847c0567db270d9845f Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 15 May 2025 02:17:57 -0400 Subject: [PATCH 28/58] [V1] Update zmq socket creation in nixl connector (#18148) Signed-off-by: Russell Bryant --- tests/test_utils.py | 7 +++++- .../kv_connector/v1/nixl_connector.py | 24 ++++++++----------- vllm/utils.py | 18 ++++++++++++++ 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index deff33e5c3ca..ea7db0a79c86 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,7 +17,7 @@ from vllm.utils import (CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot, PlaceholderModule, StoreBoolean, bind_kv_cache, deprecate_kwargs, get_open_port, - make_zmq_socket, memory_profiling, + make_zmq_path, make_zmq_socket, memory_profiling, merge_async_iterators, sha256, split_zmq_path, supports_kw, swap_dict_values) @@ -714,3 +714,8 @@ def test_make_zmq_socket_ipv6(): # Clean up zsock.close() ctx.term() + + +def test_make_zmq_path(): + assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555" + assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555" diff 
--git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index abd1ea2bea82..c0c03efcdbf4 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -21,7 +21,7 @@ get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, get_tp_group) from vllm.logger import init_logger -from vllm.utils import round_down +from vllm.utils import make_zmq_path, make_zmq_socket, round_down from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.request import RequestStatus @@ -379,7 +379,7 @@ def _nixl_handshake_listener(metadata: NixlAgentMetadata, # hack to keeps us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. port = envs.VLLM_NIXL_SIDE_CHANNEL_PORT + rank - path = f"tcp://{host}:{port}" + path = make_zmq_path("tcp", host, port) logger.debug("Starting listening on path: %s", path) with zmq_ctx(zmq.ROUTER, path) as sock: ready_event.set() @@ -397,7 +397,7 @@ def _nixl_handshake(self, host: str, port: int): # NOTE(rob): we need each rank to have a unique port. This is # a hack to keep us moving. We will switch when moving to etcd # or where we have a single ZMQ socket in the scheduler. - path = f"tcp://{host}:{port + self.rank}" + path = make_zmq_path("tcp", host, port + self.rank) logger.debug("Querying metadata on path: %s", path) with zmq_ctx(zmq.REQ, path) as sock: # Send query for the request. @@ -741,20 +741,16 @@ def _get_block_descs_ids(self, engine_id: str, def zmq_ctx(socket_type: Any, addr: str) -> Iterator[zmq.Socket]: """Context manager for a ZMQ socket""" + if socket_type not in (zmq.ROUTER, zmq.REQ): + raise ValueError(f"Unexpected socket type: {socket_type}") + ctx: Optional[zmq.Context] = None try: ctx = zmq.Context() # type: ignore[attr-defined] - - if socket_type == zmq.ROUTER: - socket = ctx.socket(zmq.ROUTER) - socket.bind(addr) - elif socket_type == zmq.REQ: - socket = ctx.socket(zmq.REQ) - socket.connect(addr) - else: - raise ValueError(f"Unexpected socket type: {socket_type}") - - yield socket + yield make_zmq_socket(ctx=ctx, + path=addr, + socket_type=socket_type, + bind=socket_type == zmq.ROUTER) finally: if ctx is not None: ctx.destroy(linger=0) diff --git a/vllm/utils.py b/vllm/utils.py index 9a7da8067ba4..edfbb8c9481e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -2350,6 +2350,24 @@ def split_zmq_path(path: str) -> Tuple[str, str, str]: return scheme, host, port +def make_zmq_path(scheme: str, host: str, port: Optional[int] = None) -> str: + """Make a ZMQ path from its parts. + + Args: + scheme: The ZMQ transport scheme (e.g. tcp, ipc, inproc). + host: The host - can be an IPv4 address, IPv6 address, or hostname. + port: Optional port number, only used for TCP sockets. + + Returns: + A properly formatted ZMQ path string. 
+ """ + if not port: + return f"{scheme}://{host}" + if is_valid_ipv6_address(host): + return f"{scheme}://[{host}]:{port}" + return f"{scheme}://{host}:{port}" + + # Adapted from: https://github.com/sgl-project/sglang/blob/v0.4.1/python/sglang/srt/utils.py#L783 # noqa: E501 def make_zmq_socket( ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] From a9944aabfa0eb0f133cf869b3ed5defb44ed7d33 Mon Sep 17 00:00:00 2001 From: omahs <73983677+omahs@users.noreply.github.com> Date: Thu, 15 May 2025 11:16:15 +0200 Subject: [PATCH 29/58] fix: typos (#18151) Signed-off-by: omahs <73983677+omahs@users.noreply.github.com> --- csrc/attention/attention_kernels.cuh | 4 ++-- examples/offline_inference/chat_with_tools.py | 4 ++-- tests/lora/test_lora_huggingface.py | 2 +- tests/model_executor/weight_utils.py | 6 +++--- vllm/config.py | 2 +- vllm/lora/ops/triton_ops/lora_expand_op.py | 2 +- vllm/model_executor/layers/mamba/mamba_mixer2.py | 2 +- vllm/model_executor/models/granite_speech.py | 4 ++-- vllm/model_executor/models/phi4mm_audio.py | 8 ++++---- vllm/v1/request.py | 2 +- 10 files changed, 18 insertions(+), 18 deletions(-) diff --git a/csrc/attention/attention_kernels.cuh b/csrc/attention/attention_kernels.cuh index eb216dc8baf1..79a546554fa1 100644 --- a/csrc/attention/attention_kernels.cuh +++ b/csrc/attention/attention_kernels.cuh @@ -172,7 +172,7 @@ __device__ void paged_attention_kernel( // Load the query to registers. // Each thread in a thread group has a different part of the query. - // For example, if the the thread group size is 4, then the first thread in + // For example, if the thread group size is 4, then the first thread in // the group has 0, 4, 8, ... th vectors of the query, and the second thread // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because // q is split from a qkv tensor, it may not be contiguous. @@ -259,7 +259,7 @@ __device__ void paged_attention_kernel( // Load a key to registers. // Each thread in a thread group has a different part of the key. - // For example, if the the thread group size is 4, then the first thread in + // For example, if the thread group size is 4, then the first thread in // the group has 0, 4, 8, ... th vectors of the key, and the second thread // has 1, 5, 9, ... th vectors of the key, and so on. 
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) { diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py index 15519bfed9cb..b532bf42adfb 100644 --- a/examples/offline_inference/chat_with_tools.py +++ b/examples/offline_inference/chat_with_tools.py @@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'): "partly cloudly, with highs in the 90's.") -tool_funtions = {"get_current_weather": get_current_weather} +tool_functions = {"get_current_weather": get_current_weather} tools = [{ "type": "function", @@ -122,7 +122,7 @@ def get_current_weather(city: str, state: str, unit: 'str'): # above defined function tool_calls = json.loads(output) tool_answers = [ - tool_funtions[call['name']](**call['arguments']) for call in tool_calls + tool_functions[call['name']](**call['arguments']) for call in tool_calls ] # append the answer as a tool message and let the LLM give you an answer diff --git a/tests/lora/test_lora_huggingface.py b/tests/lora/test_lora_huggingface.py index 0875128c4ff1..90498c47fb10 100644 --- a/tests/lora/test_lora_huggingface.py +++ b/tests/lora/test_lora_huggingface.py @@ -30,7 +30,7 @@ def test_load_checkpoints_from_huggingface(lora_fixture_name, request): lora_path = get_adapter_absolute_path(lora_name) - # lora loading should work for either absolute path and hugggingface id. + # lora loading should work for either absolute path and huggingface id. peft_helper = PEFTHelper.from_local_dir(lora_path, 4096) lora_model = LoRAModel.from_local_checkpoint( lora_path, diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py index 11dfe4d4995d..bdaba22c3c7a 100644 --- a/tests/model_executor/weight_utils.py +++ b/tests/model_executor/weight_utils.py @@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation(): try: # enable hf hub transfer if available import hf_transfer # type: ignore # noqa - HF_TRANFER_ACTIVE = True + HF_TRANSFER_ACTIVE = True except ImportError: - HF_TRANFER_ACTIVE = False + HF_TRANSFER_ACTIVE = False assert (huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == - HF_TRANFER_ACTIVE) + HF_TRANSFER_ACTIVE) def test_download_weights_from_hf(): diff --git a/vllm/config.py b/vllm/config.py index 81cac4d04116..19de4d0549b6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -297,7 +297,7 @@ class ModelConfig: - 1K -> 1024\n - 25.6k -> 25,600""" spec_target_max_model_len: Optional[int] = None - """Specify the the maximum length for spec decoding draft models.""" + """Specify the maximum length for spec decoding draft models.""" quantization: Optional[QuantizationMethods] = None """Method used to quantize the weights. If `None`, we first check the `quantization_config` attribute in the model config file. If that is diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index 13ddaaf961f7..9feb9e462459 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -153,7 +153,7 @@ def _lora_expand( lora_token_start_loc (torch.Tensor): A cumulative sum of num_tokens_per_lora. lora_token_start_loc[0] is always 0 so that lora_token_start_loc[i], along with num_tokens_per_lora[i] - identifies the the region in token_indices_sorted_by_lora_ids that + identifies the region in token_indices_sorted_by_lora_ids that LoRA lora_ids[i] should process. lora_ids (torch.Tensor): LoRA ids to process. 
no_lora_flag_cpu (torch.Tensor): A CPU tensor of size 1, that indicates diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py index 1ea65e96d750..bc6e6fcdd0a2 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer2.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py @@ -142,7 +142,7 @@ def mamba_v2_sharded_weight_loader( ) -> LoaderFunction: """Create a weight loader for mamba v2. This ensures that the projections are correctly sharded so that they can be split into x, B, C. It also - ensures the the all the groups corresponding to a head shard is placed + ensures that all the groups corresponding to a head shard is placed together with it. """ diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index 512ec55177d8..fd8fb48c50e3 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -21,7 +21,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""Inference-only IBM Granite speeech model.""" +"""Inference-only IBM Granite speech model.""" import math from collections.abc import Iterable, Mapping from typing import Optional, TypedDict, Union @@ -626,7 +626,7 @@ def _build_input_features_mask( audio_embed_sizes: torch.Tensor, ) -> torch.Tensor: """Calculate the input features mask, which will generally be used - to mask the the padded features for all entries in the batch except + to mask the padded features for all entries in the batch except for those with the most audio features. Args: diff --git a/vllm/model_executor/models/phi4mm_audio.py b/vllm/model_executor/models/phi4mm_audio.py index 609746b48588..98cef75069ae 100644 --- a/vllm/model_executor/models/phi4mm_audio.py +++ b/vllm/model_executor/models/phi4mm_audio.py @@ -91,9 +91,9 @@ class ConformerEncoderLayer(nn.Module): if set to True, use GLULinear module, otherwise, used GLUPointWiseConv module. default to False. - attention_innner_dim: int, optional + attention_inner_dim: int, optional if equal to -1, attention dim for linears k/q/v is - equal to d_model. otherwise attention_innner_dim is used. + equal to d_model. otherwise attention_inner_dim is used. default -1. attention_glu_type: str, optional activation function for glu used in the multihead attention, @@ -148,7 +148,7 @@ def __init__( conv_glu_type="sigmoid", bias_in_glu=True, linear_glu_in_convm=False, - attention_innner_dim=-1, + attention_inner_dim=-1, attention_glu_type="swish", activation_checkpointing="", export=False, @@ -169,7 +169,7 @@ def __init__( n_head, d_model, dropout_rate, - attention_innner_dim, + attention_inner_dim, attention_glu_type, bias_in_glu, use_pt_scaled_dot_product_attention= diff --git a/vllm/v1/request.py b/vllm/v1/request.py index d2843b65ab59..d1cdd2c52750 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -72,7 +72,7 @@ def __init__( assert len(self.mm_inputs) == len(self.mm_hashes) # Read-only views - # Prevent directly appending to the these lists since + # Prevent directly appending to these lists since # they should also be updated simultaneously. 
self.output_token_ids = ConstantList(self._output_token_ids) self.all_token_ids = ConstantList(self._all_token_ids) From 07ad27121f7a24ce37f7f49d9d936936844bd058 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 15 May 2025 12:00:21 +0100 Subject: [PATCH 30/58] Update deprecated type hinting in `model_loader` (#18130) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- pyproject.toml | 4 +- .../model_loader/bitsandbytes_loader.py | 25 +++++----- .../model_loader/default_loader.py | 11 +++-- .../model_loader/gguf_loader.py | 6 +-- vllm/model_executor/model_loader/neuron.py | 10 ++-- .../model_loader/neuronx_distributed.py | 6 +-- .../model_loader/runai_streamer_loader.py | 7 +-- .../model_loader/sharded_state_loader.py | 13 ++--- .../model_executor/model_loader/tensorizer.py | 7 +-- .../model_loader/tensorizer_loader.py | 4 +- vllm/model_executor/model_loader/utils.py | 14 +++--- .../model_loader/weight_utils.py | 47 ++++++++++--------- 12 files changed, 80 insertions(+), 74 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9465f1e8f059..0b803a26b658 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,15 +71,15 @@ exclude = [ "vllm/third_party/**" = ["ALL"] "vllm/version.py" = ["F401"] "vllm/_version.py" = ["ALL"] -# Python 3.8 typing. TODO: Remove these excludes after v1.0.0 +# Python 3.8 typing - skip V0 code "vllm/attention/**/*.py" = ["UP006", "UP035"] "vllm/core/**/*.py" = ["UP006", "UP035"] "vllm/engine/**/*.py" = ["UP006", "UP035"] "vllm/executor/**/*.py" = ["UP006", "UP035"] -"vllm/model_executor/model_loader/**/*.py" = ["UP006", "UP035"] "vllm/prompt_adapter/**/*.py" = ["UP006", "UP035"] "vllm/spec_decode/**/*.py" = ["UP006", "UP035"] "vllm/worker/**/*.py" = ["UP006", "UP035"] +# Python 3.8 typing - skip utils for ROCm "vllm/utils.py" = ["UP006", "UP035"] [tool.ruff.lint] diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py index 57189bfafc06..47a7a06bb744 100644 --- a/vllm/model_executor/model_loader/bitsandbytes_loader.py +++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py @@ -6,7 +6,8 @@ import itertools import math import os -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple +from collections.abc import Generator +from typing import Any, Callable, Optional import numpy as np import torch @@ -49,21 +50,21 @@ def __init__(self, load_config: LoadConfig): super().__init__(load_config) # Save the module names without sharding. - self.unsharded_weights_modules: List[str] = [] + self.unsharded_weights_modules: list[str] = [] # Save the module names that are sharded by column. - self.column_sharded_weights_modules: List[str] = [] + self.column_sharded_weights_modules: list[str] = [] # Store all module names (from transformers) that support # BNB quantization. - self.target_modules: List[str] = [] + self.target_modules: list[str] = [] # mapping weight names from transformers to vllm. self.weight_mapper: Callable = lambda name: name def _get_weight_files( self, model_name_or_path: str, - allowed_patterns: List[str], + allowed_patterns: list[str], revision: Optional[str] = None, - ) -> Tuple[str, List[str], str]: + ) -> tuple[str, list[str], str]: """Retrieve weight files. Download the files if necessary. 
Return the weight files and the file pattern.""" @@ -95,7 +96,7 @@ def _get_weight_files( f"No model weights found in: `{model_name_or_path}`") def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> Tuple[List[str], bool]: + revision: Optional[str]) -> tuple[list[str], bool]: """Prepare weight files for the model.""" allowed_patterns = ["*.safetensors", "*.bin", "*.pt"] @@ -155,7 +156,7 @@ def _get_quantized_weights_iterator( revision: Optional[str], pre_quant: bool, load_8bit: bool, - ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str, + ) -> tuple[Generator[tuple[str, torch.Tensor], None, None], dict[str, Any]]: """Get an iterator to the model weights with bitsandbytes quantization, as well as the quantization state dictionary.""" @@ -175,7 +176,7 @@ def _get_quantized_weights_iterator( hf_weights_files, use_safetensors = self._prepare_weights( model_name_or_path, revision) - quant_state_dict: Dict[str, Any] = {} + quant_state_dict: dict[str, Any] = {} if pre_quant: if load_8bit: @@ -257,7 +258,7 @@ def _quantized_4bit_generator(self, hf_weights_files, use_safetensors, # Closure to parse quant_state for each prequant weight def _parse_quant_state(param_name: str, - temp_state_dict: Dict) -> QuantState: + temp_state_dict: dict) -> QuantState: quant_state = {} for k in temp_state_dict: if param_name + "." in k: @@ -415,7 +416,7 @@ def _load_weights(self, model_config: ModelConfig, # Modules whose weights might have fused on disk # we need their output_sizes to make shard in flight correctly with TP - self.maybe_fused_weights_modules: Dict[str, List[int]] = {} + self.maybe_fused_weights_modules: dict[str, list[int]] = {} self._get_bnb_target_modules(model) for name, module in model.named_modules(): # Some modules like `ReplicatedLinear` should not have their weights @@ -480,7 +481,7 @@ def _load_weights(self, model_config: ModelConfig, torch.cuda.empty_cache() param_dict = dict(model.named_parameters()) - stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + stacked_quant_state_dict: dict[str, dict[int, Any]] = {} # TODO: Change this lazy import to normal import # after the checks are updated to run on a new version from vllm.model_executor.models.utils import is_pp_missing_parameter diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py index c8bc4aecaecf..21eb7d8a75fb 100644 --- a/vllm/model_executor/model_loader/default_loader.py +++ b/vllm/model_executor/model_loader/default_loader.py @@ -3,7 +3,8 @@ import glob import os import time -from typing import Generator, Iterable, List, Optional, Tuple, cast +from collections.abc import Generator, Iterable +from typing import Optional, cast import huggingface_hub import torch @@ -92,7 +93,7 @@ def _prepare_weights( revision: Optional[str], fall_back_to_pt: bool, allow_patterns_overrides: Optional[list[str]], - ) -> Tuple[str, List[str], bool]: + ) -> tuple[str, list[str], bool]: """Prepare weights for the model. 
If the model is not local, it will be downloaded.""" @@ -138,7 +139,7 @@ def _prepare_weights( else: hf_folder = model_name_or_path - hf_weights_files: List[str] = [] + hf_weights_files: list[str] = [] for pattern in allow_patterns: hf_weights_files += glob.glob(os.path.join(hf_folder, pattern)) if len(hf_weights_files) > 0: @@ -173,7 +174,7 @@ def _prepare_weights( def _get_weights_iterator( self, source: "Source" - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + ) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_folder, hf_weights_files, use_safetensors = self._prepare_weights( source.model_or_path, source.revision, source.fall_back_to_pt, @@ -238,7 +239,7 @@ def get_all_weights( self, model_config: ModelConfig, model: nn.Module, - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + ) -> Generator[tuple[str, torch.Tensor], None, None]: primary_weights = DefaultModelLoader.Source( model_config.model, model_config.revision, diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index ace1cd371286..2766c9787b83 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, Generator, Tuple +from collections.abc import Generator import gguf import torch @@ -84,8 +84,8 @@ def _get_gguf_weights_map(self, model_config: ModelConfig): return gguf_to_hf_name_map def _get_weights_iterator( - self, model_name_or_path: str, gguf_to_hf_name_map: Dict[str, str] - ) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, model_name_or_path: str, gguf_to_hf_name_map: dict[str, str] + ) -> Generator[tuple[str, torch.Tensor], None, None]: return gguf_quant_weights_iterator(model_name_or_path, gguf_to_hf_name_map) diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index e4a48483764a..e65d16cae76c 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -5,7 +5,7 @@ import copy import importlib import os -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn as nn @@ -33,7 +33,7 @@ } # Models supported by Neuron. 
-_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = { +_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str, str]] = { "LlamaForCausalLM": ("transformers_neuronx.llama.model", "LlamaForSampling", "LlamaForCausalLM"), "MistralForCausalLM": ("transformers_neuronx.mistral.model", @@ -146,7 +146,7 @@ def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: batch_size, num_steps = logits.shape seq_ids = [ seq_id for sg in sampling_metadata.seq_groups @@ -188,7 +188,7 @@ def _get_model_architecture(config: PretrainedConfig) -> str: f"{list(_NEURON_SUPPORTED_MODELS.keys())}") -def _get_buckets(env: str, default_value: List[int]) -> List[int]: +def _get_buckets(env: str, default_value: list[int]) -> list[int]: env_value = os.getenv(env) if env_value is None: return default_value @@ -464,7 +464,7 @@ def get_neuron_eagle_speculation_model(model_config: ModelConfig, draft_model.eval() - token_tree: Dict[int, List[int]] = ast.literal_eval( + token_tree: dict[int, list[int]] = ast.literal_eval( speculation_config.speculative_token_tree) speculation_model = EagleSpeculativeDecoder(draft_model.model, diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py index f879c99ac2ef..1c4f66061d1d 100644 --- a/vllm/model_executor/model_loader/neuronx_distributed.py +++ b/vllm/model_executor/model_loader/neuronx_distributed.py @@ -9,7 +9,7 @@ import multiprocessing import os import shutil -from typing import Dict, List, Optional, Tuple +from typing import Optional import torch import torch.nn as nn @@ -46,7 +46,7 @@ } # Models supported by Neuronx distributed for inference. -_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str]] = { +_NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = { "LlamaForCausalLM": ("neuronx_distributed_inference.models.llama.modeling_llama", "NeuronLlamaForCausalLM"), @@ -365,7 +365,7 @@ def sample( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> Optional[List[SamplerOutput]]: + ) -> Optional[list[SamplerOutput]]: batch_size, num_steps = logits.shape seq_ids = [ seq_id for sg in sampling_metadata.seq_groups diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py index 1fbb5ca56644..a695ba03bd1d 100644 --- a/vllm/model_executor/model_loader/runai_streamer_loader.py +++ b/vllm/model_executor/model_loader/runai_streamer_loader.py @@ -2,7 +2,8 @@ # ruff: noqa: SIM117 import glob import os -from typing import Generator, List, Optional, Tuple +from collections.abc import Generator +from typing import Optional import torch from torch import nn @@ -48,7 +49,7 @@ def __init__(self, load_config: LoadConfig): os.environ["RUNAI_STREAMER_S3_ENDPOINT"] = aws_endpoint_url def _prepare_weights(self, model_name_or_path: str, - revision: Optional[str]) -> List[str]: + revision: Optional[str]) -> list[str]: """Prepare weights for the model. 
If the model is not local, it will be downloaded.""" @@ -87,7 +88,7 @@ def _prepare_weights(self, model_name_or_path: str, def _get_weights_iterator( self, model_or_path: str, - revision: str) -> Generator[Tuple[str, torch.Tensor], None, None]: + revision: str) -> Generator[tuple[str, torch.Tensor], None, None]: """Get an iterator for the model weights based on the load format.""" hf_weights_files = self._prepare_weights(model_or_path, revision) return runai_safetensors_weights_iterator( diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py index 152a3d699726..913bda7e007a 100644 --- a/vllm/model_executor/model_loader/sharded_state_loader.py +++ b/vllm/model_executor/model_loader/sharded_state_loader.py @@ -3,7 +3,8 @@ import collections import glob import os -from typing import Any, Dict, Generator, List, Optional, Tuple +from collections.abc import Generator +from typing import Any, Optional import torch from torch import nn @@ -48,12 +49,12 @@ def __init__(self, @staticmethod def _filter_subtensors( - tensors: Dict[str, torch.Tensor], ) -> Dict[str, torch.Tensor]: + tensors: dict[str, torch.Tensor], ) -> dict[str, torch.Tensor]: """ Filter out all tensors that share the same memory or a subset of the memory of another tensor. """ - same_storage_groups: Dict[Any, List[Tuple[str, torch.Tensor]]] = ( + same_storage_groups: dict[Any, list[tuple[str, torch.Tensor]]] = ( collections.defaultdict(list)) for key, tensor in tensors.items(): if tensor.numel(): @@ -63,7 +64,7 @@ def _filter_subtensors( def get_end_ptr(tensor: torch.Tensor) -> int: return tensor.view(-1)[-1].data_ptr() + tensor.element_size() - result: Dict[str, torch.Tensor] = {} + result: dict[str, torch.Tensor] = {} for group in same_storage_groups.values(): for k, t in group: a, b = t.data_ptr(), get_end_ptr(t) @@ -160,7 +161,7 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: return model.eval() def iterate_over_files( - self, paths) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, paths) -> Generator[tuple[str, torch.Tensor], None, None]: if self.runai_model_streamer: yield from runai_safetensors_weights_iterator(paths, True) else: @@ -188,7 +189,7 @@ def save_model( part_idx = 0 total_size = 0 state_dict = ShardedStateLoader._filter_subtensors(model.state_dict()) - state_dict_part: Dict[str, torch.Tensor] = {} + state_dict_part: dict[str, torch.Tensor] = {} for key, tensor in state_dict.items(): param_size = tensor.nelement() * tensor.element_size() if max_size is not None and total_size + param_size > max_size: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 117251ccf05f..0ff35b3a6dca 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -6,9 +6,10 @@ import os import re import time +from collections.abc import Generator from dataclasses import dataclass from functools import partial -from typing import BinaryIO, Generator, Optional, Tuple, Type, Union +from typing import BinaryIO, Optional, Union import torch from torch import nn @@ -67,7 +68,7 @@ class TensorizerConfig: s3_access_key_id: Optional[str] = None s3_secret_access_key: Optional[str] = None s3_endpoint: Optional[str] = None - model_class: Optional[Type[torch.nn.Module]] = None + model_class: Optional[type[torch.nn.Module]] = None hf_config: Optional[PretrainedConfig] = None dtype: Optional[Union[str, torch.dtype]] = None _is_sharded: bool 
= False @@ -365,7 +366,7 @@ def deserialize(self): def tensorizer_weights_iterator( tensorizer_args: "TensorizerArgs" -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: logger.warning("Deserializing HuggingFace models is not optimized for " "loading on vLLM, as tensorizer is forced to load to CPU. " "Consider deserializing a vLLM model instead for faster " diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py index 7cf3940ab644..4107e741fd8f 100644 --- a/vllm/model_executor/model_loader/tensorizer_loader.py +++ b/vllm/model_executor/model_loader/tensorizer_loader.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # ruff: noqa: SIM117 import copy -from typing import Generator, Tuple +from collections.abc import Generator import torch from torch import nn @@ -36,7 +36,7 @@ def _verify_config(self, model_config: ModelConfig, self.tensorizer_config.verify_with_parallel_config(parallel_config) def _get_weights_iterator( - self, ) -> Generator[Tuple[str, torch.Tensor], None, None]: + self, ) -> Generator[tuple[str, torch.Tensor], None, None]: tensorizer_args = self.tensorizer_config._construct_tensorizer_args() return tensorizer_weights_iterator(tensorizer_args) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index ddc857aebdc8..68b1f1ad74d3 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -5,7 +5,7 @@ import warnings from contextlib import contextmanager from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple, Type +from typing import Optional import torch import transformers @@ -124,7 +124,7 @@ def device_loading_context(module: torch.nn.Module, yield module return - original_device_states: Dict[str, torch.device] = {} + original_device_states: dict[str, torch.device] = {} # Store original device states and move parameters to GPU if they're on CPU for name, p in module.named_parameters(): @@ -214,7 +214,7 @@ def resolve_transformers_arch(model_config: ModelConfig, def get_model_architecture( - model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: + model_config: ModelConfig) -> tuple[type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) # Special handling for quantized Mixtral. @@ -257,8 +257,8 @@ class ParamMapping: It creates a bidirectional mapping between packed parameters and their constituent parts. 
""" - packed_mapping: Dict[str, List[str]] - inverse_packed_mapping: Dict[str, Tuple[str, + packed_mapping: dict[str, list[str]] + inverse_packed_mapping: dict[str, tuple[str, int]] = field(default_factory=dict) def __post_init__(self): @@ -273,7 +273,7 @@ def __post_init__(self): ) def get_sub_modules(self, - module_name: str) -> Optional[Tuple[str, List[str]]]: + module_name: str) -> Optional[tuple[str, list[str]]]: for key, value in self.packed_mapping.items(): if module_name.endswith(key): return key, value @@ -281,7 +281,7 @@ def get_sub_modules(self, def configure_quant_config(quant_config: QuantizationConfig, - model_class: Type[nn.Module]): + model_class: type[nn.Module]): """ Pass packed_modules_mapping by reference to quant_config so that quant_config can properly match fused modules diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 8f9d809022aa..a1cf43328bab 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -8,8 +8,9 @@ import tempfile import time from collections import defaultdict +from collections.abc import Generator from pathlib import Path -from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import filelock import gguf @@ -221,7 +222,7 @@ def get_sparse_attention_config( model_config: ModelConfig, load_config: LoadConfig, sparse_attention_config_filename: str = "sparse_attention_config.json", -) -> Dict[str, Any]: +) -> dict[str, Any]: model_name_or_path = model_config.model is_local = os.path.isdir(model_name_or_path) if not is_local: @@ -253,9 +254,9 @@ def get_sparse_attention_config( def download_weights_from_hf( model_name_or_path: str, cache_dir: Optional[str], - allow_patterns: List[str], + allow_patterns: list[str], revision: Optional[str] = None, - ignore_patterns: Optional[Union[str, List[str]]] = None, + ignore_patterns: Optional[Union[str, list[str]]] = None, ) -> str: """Download model weights from Hugging Face Hub. @@ -263,11 +264,11 @@ def download_weights_from_hf( model_name_or_path (str): The model name or path. cache_dir (Optional[str]): The cache directory to store the model weights. If None, will use HF defaults. - allow_patterns (List[str]): The allowed patterns for the + allow_patterns (list[str]): The allowed patterns for the weight files. Files matched by any of the patterns will be downloaded. revision (Optional[str]): The revision of the model. - ignore_patterns (Optional[Union[str, List[str]]]): The patterns to + ignore_patterns (Optional[Union[str, list[str]]]): The patterns to filter out the weight files. Files matched by any of the patterns will be ignored. @@ -347,9 +348,9 @@ def download_safetensors_index_file_from_hf( # Passing both of these to the weight loader functionality breaks. # So, we use the index_file to # look up which safetensors files should be used. -def filter_duplicate_safetensors_files(hf_weights_files: List[str], +def filter_duplicate_safetensors_files(hf_weights_files: list[str], hf_folder: str, - index_file: str) -> List[str]: + index_file: str) -> list[str]: # model.safetensors.index.json is a mapping from keys in the # torch state_dict to safetensors file holding that weight. 
index_file_name = os.path.join(hf_folder, index_file) @@ -372,7 +373,7 @@ def filter_duplicate_safetensors_files(hf_weights_files: List[str], def filter_files_not_needed_for_inference( - hf_weights_files: List[str]) -> List[str]: + hf_weights_files: list[str]) -> list[str]: """ Exclude files that are not needed for inference. @@ -408,9 +409,9 @@ def np_cache_weights_iterator( model_name_or_path: str, cache_dir: Optional[str], hf_folder: str, - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model np files. Will dump the model weights to numpy files if they are not already dumped. @@ -424,7 +425,7 @@ def np_cache_weights_iterator( # dumping the same model weights to numpy at the same time. with get_lock(model_name_or_path, cache_dir): if not os.path.exists(weight_names_file): - weight_names: List[str] = [] + weight_names: list[str] = [] for bin_file in tqdm( hf_weights_files, desc="Loading np_cache checkpoint shards", @@ -453,9 +454,9 @@ def np_cache_weights_iterator( def safetensors_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" for st_file in tqdm( hf_weights_files, @@ -470,9 +471,9 @@ def safetensors_weights_iterator( def runai_safetensors_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" with SafetensorsStreamer() as streamer: for st_file in tqdm( @@ -486,9 +487,9 @@ def runai_safetensors_weights_iterator( def fastsafetensors_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files using fastsafetensor library.""" if torch.distributed.is_initialized(): @@ -525,10 +526,10 @@ def fastsafetensors_weights_iterator( def pt_weights_iterator( - hf_weights_files: List[str], + hf_weights_files: list[str], use_tqdm_on_load: bool, pt_load_map_location: Union[str, dict[str, str]] = "cpu", -) -> Generator[Tuple[str, torch.Tensor], None, None]: +) -> Generator[tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model bin/pt files.""" for bin_file in tqdm( hf_weights_files, @@ -544,7 +545,7 @@ def pt_weights_iterator( def get_gguf_extra_tensor_names( - gguf_file: str, gguf_to_hf_name_map: Dict[str, str]) -> List[str]: + gguf_file: str, gguf_to_hf_name_map: dict[str, str]) -> list[str]: reader = gguf.GGUFReader(gguf_file) expected_gguf_keys = set(gguf_to_hf_name_map.keys()) exact_gguf_keys = set([tensor.name for tensor in reader.tensors]) @@ -553,8 +554,8 @@ def get_gguf_extra_tensor_names( def gguf_quant_weights_iterator( - gguf_file: str, gguf_to_hf_name_map: Dict[str, str] -) -> Generator[Tuple[str, torch.Tensor], None, None]: + gguf_file: str, gguf_to_hf_name_map: dict[str, str] +) -> Generator[tuple[str, torch.Tensor], None, None]: """ Iterate over the quant weights in the model gguf files and convert them to torch tensors From 
451da4bcbdc2dcabf3e319b4a82b72674c33f4de Mon Sep 17 00:00:00 2001 From: hustxiayang Date: Thu, 15 May 2025 07:01:49 -0400 Subject: [PATCH 31/58] add tools into TokenizeChatRequest (#18187) Signed-off-by: yangxia --- tests/entrypoints/openai/test_tokenization.py | 77 +++++++++++++++++++ vllm/entrypoints/openai/protocol.py | 4 + .../openai/serving_tokenization.py | 3 + 3 files changed, 84 insertions(+) diff --git a/tests/entrypoints/openai/test_tokenization.py b/tests/entrypoints/openai/test_tokenization.py index 663b722426c5..9773f3e45b99 100644 --- a/tests/entrypoints/openai/test_tokenization.py +++ b/tests/entrypoints/openai/test_tokenization.py @@ -145,6 +145,83 @@ async def test_tokenize_chat( } +@pytest.mark.asyncio +@pytest.mark.parametrize( + "model_name,tokenizer_name", + [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")], + indirect=["tokenizer_name"], +) +async def test_tokenize_chat_with_tools( + server: RemoteOpenAIServer, + model_name: str, + tokenizer_name: str, +): + tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode="fast") + + for add_generation in [False, True]: + for add_special in [False, True]: + conversation = [{ + "role": + "user", + "content": + "What's the weather like in Paris today?", + }] + + tools = [{ + "type": "function", + "function": { + "name": "get_weather", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string" + } + }, + }, + }, + }] + + for continue_final in [False, True]: + if add_generation and continue_final: + continue + if continue_final: + conversation.append({ + "role": "assistant", + "content": "Sure," + }) + + prompt = tokenizer.apply_chat_template( + add_generation_prompt=add_generation, + continue_final_message=continue_final, + conversation=conversation, + tools=tools, + tokenize=False, + ) + tokens = tokenizer.encode(prompt, + add_special_tokens=add_special) + + response = requests.post( + server.url_for("tokenize"), + json={ + "add_generation_prompt": add_generation, + "continue_final_message": continue_final, + "add_special_tokens": add_special, + "messages": conversation, + "model": model_name, + "tools": tools, + }, + ) + response.raise_for_status() + + assert response.json() == { + "tokens": tokens, + "count": len(tokens), + "max_model_len": 8192, + } + + @pytest.mark.asyncio @pytest.mark.parametrize( "model_name,tokenizer_name", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 8ac6534875dd..cd6ee3670117 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -1593,6 +1593,10 @@ class TokenizeChatRequest(OpenAIBaseModel): default=None, description=("Additional kwargs to pass to the HF processor."), ) + tools: Optional[list[ChatCompletionToolsParam]] = Field( + default=None, + description=("A list of tools the model may call."), + ) @model_validator(mode="before") @classmethod diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 5f4678cb0e69..349e0ac9e68b 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -65,6 +65,8 @@ async def create_tokenize( tokenizer = await self.engine_client.get_tokenizer(lora_request) if isinstance(request, TokenizeChatRequest): + tool_dicts = (None if request.tools is None else + [tool.model_dump() for tool in request.tools]) ( _, request_prompts, @@ -73,6 +75,7 @@ async def create_tokenize( request, tokenizer, request.messages, + 
tool_dicts=tool_dicts, chat_template=request.chat_template or self.chat_template, chat_template_content_format=self. chat_template_content_format, From 01c22335baa03dbbc3ae662c76a089ff1c5b5742 Mon Sep 17 00:00:00 2001 From: Thomas Parnell Date: Thu, 15 May 2025 15:39:00 +0200 Subject: [PATCH 32/58] [Kernel] [V1] Fix performance regression for triton unified attention (#18161) Signed-off-by: Thomas Parnell Co-authored-by: Lucas Wilkinson --- .../attention/ops/triton_unified_attention.py | 4 ++-- vllm/v1/attention/backends/triton_attn.py | 19 ++++++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py index f08000a75bc7..241e84ca669d 100644 --- a/vllm/attention/ops/triton_unified_attention.py +++ b/vllm/attention/ops/triton_unified_attention.py @@ -56,11 +56,11 @@ def kernel_unified_attention_2d( stride_k_cache_0: tl.int64, # int stride_k_cache_1: tl.int64, # int stride_k_cache_2: tl.int64, # int - stride_k_cache_3: tl.int64, # int + stride_k_cache_3: tl.constexpr, # int stride_v_cache_0: tl.int64, # int stride_v_cache_1: tl.int64, # int stride_v_cache_2: tl.int64, # int - stride_v_cache_3: tl.int64, # int + stride_v_cache_3: tl.constexpr, # int query_start_len_ptr, # [num_seqs+1] BLOCK_Q: tl.constexpr, # int num_seqs: tl.int32, diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index c4922a716bc2..908bf1274125 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -1,6 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 """Attention layer with PagedAttention and Triton prefix prefill.""" -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional import torch @@ -12,10 +12,23 @@ from vllm.platforms import current_platform from vllm.v1.attention.backends.flash_attn import ( FlashAttentionMetadata, FlashAttentionMetadataBuilder) +from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.worker.block_table import BlockTable + +if TYPE_CHECKING: + from vllm.v1.worker.gpu_model_runner import GPUModelRunner logger = init_logger(__name__) +class TritonAttentionMetadataBuilder(FlashAttentionMetadataBuilder): + + def __init__(self, runner: "GPUModelRunner", kv_cache_spec: AttentionSpec, + block_table: BlockTable): + super().__init__(runner, kv_cache_spec, block_table) + self.aot_schedule = False + + class TritonAttentionBackend(AttentionBackend): accept_output_buffer: bool = True @@ -52,8 +65,8 @@ def use_cascade_attention(*args, **kwargs) -> bool: return False @staticmethod - def get_builder_cls() -> type["FlashAttentionMetadataBuilder"]: - return FlashAttentionMetadataBuilder + def get_builder_cls() -> type["TritonAttentionMetadataBuilder"]: + return TritonAttentionMetadataBuilder class TritonAttentionImpl(AttentionImpl): From 566ec04c3d3fb8e8b0876f04cd3be036d1f8d3ac Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 15 May 2025 10:49:23 -0500 Subject: [PATCH 33/58] Adding "Basic Models Test" and "Multi-Modal Models Test (Extended) 3" in AMD Pipeline (#18106) Signed-off-by: Alexei V. 
Ivanov Co-authored-by: Cyrus Leung --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 8 ++++++++ .buildkite/test-pipeline.yaml | 6 +++--- requirements/rocm-test.txt | 6 ++++++ tests/models/test_transformers.py | 8 ++++++++ 4 files changed, 25 insertions(+), 3 deletions(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 97dcc42312f6..bbc896ec6819 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -82,6 +82,14 @@ if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"} fi +if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then + commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"} +fi + +if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then + commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"} +fi + if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"} fi diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1eb3e1f4c482..1459156f63db 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -455,7 +455,7 @@ steps: ##### models test ##### - label: Basic Models Test # 24min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] torch_nightly: true source_file_dependencies: - vllm/ @@ -527,7 +527,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - label: Multi-Modal Models Test (Extended) 3 - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] optional: true source_file_dependencies: - vllm/ @@ -537,7 +537,7 @@ steps: - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - label: Quantized Models Test - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] source_file_dependencies: - vllm/model_executor/layers/quantization - tests/models/quantization diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index abd4212c6e35..25f950a99ece 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -22,4 +22,10 @@ decord==0.6.0 #sentence-transformers # required by entrypoints/openai/test_score.py sentence-transformers==3.4.1 +# Basic Models Test +matplotlib==3.10.3 + +# Multi-Modal Models Test (Extended) 3 +blobfile==3.0.0 + diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py index 6da488897be5..6e38c4c7cadb 100644 --- a/tests/models/test_transformers.py +++ b/tests/models/test_transformers.py @@ -2,6 +2,8 @@ """Test the functionality of the Transformers backend.""" import pytest +from 
vllm.platforms import current_platform + from ..conftest import HfRunner, VllmRunner from ..utils import multi_gpu_test from .utils import check_logprobs_close @@ -33,6 +35,9 @@ def check_implementation( ) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="Llama-3.2-1B-Instruct, Ilama-3.2-1B produce memory access fault.") @pytest.mark.parametrize( "model,model_impl", [ @@ -64,6 +69,9 @@ def test_distributed( "meta-llama/Llama-3.2-1B-Instruct", **kwargs) +@pytest.mark.skipif( + current_platform.is_rocm(), + reason="bitsandbytes quantization is currently not supported in rocm.") @pytest.mark.parametrize("model, quantization_kwargs", [ ( "meta-llama/Llama-3.2-1B-Instruct", From 51ff154639a31973e6a33fa6208c2b50a88d62e2 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Thu, 15 May 2025 16:57:49 +0100 Subject: [PATCH 34/58] Improve examples rendering in docs and GitHub (#18203) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- .../disaggregated-prefill-v1/README.md | 9 +++++++++ .../openai_batch.md => openai_batch/README.md} | 18 +++++++++--------- .../openai_example_batch.jsonl | 0 .../disaggregated_serving/README.md | 8 ++++++++ .../disagg_proxy_demo.py | 2 +- .../{ => disaggregated_serving}/kv_events.sh | 0 .../opentelemetry/{Otel.md => README.md} | 0 7 files changed, 27 insertions(+), 10 deletions(-) create mode 100644 examples/offline_inference/disaggregated-prefill-v1/README.md rename examples/offline_inference/{openai/openai_batch.md => openai_batch/README.md} (94%) rename examples/offline_inference/{openai => openai_batch}/openai_example_batch.jsonl (100%) create mode 100644 examples/online_serving/disaggregated_serving/README.md rename examples/online_serving/{disagg_examples => disaggregated_serving}/disagg_proxy_demo.py (99%) rename examples/online_serving/{ => disaggregated_serving}/kv_events.sh (100%) rename examples/online_serving/opentelemetry/{Otel.md => README.md} (100%) diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/offline_inference/disaggregated-prefill-v1/README.md new file mode 100644 index 000000000000..f708eb253838 --- /dev/null +++ b/examples/offline_inference/disaggregated-prefill-v1/README.md @@ -0,0 +1,9 @@ +# Disaggregated Prefill V1 + +This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. + +## Files + +- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially. +- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`. +- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`. diff --git a/examples/offline_inference/openai/openai_batch.md b/examples/offline_inference/openai_batch/README.md similarity index 94% rename from examples/offline_inference/openai/openai_batch.md rename to examples/offline_inference/openai_batch/README.md index d271573aa96f..42a19f71e9de 100644 --- a/examples/offline_inference/openai/openai_batch.md +++ b/examples/offline_inference/openai_batch/README.md @@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format The OpenAI batch file format consists of a series of json objects on new lines. 
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) +[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl) Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. @@ -30,13 +30,13 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ```console -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ```console -$ cat offline_inference/openai/openai_example_batch.jsonl +$ cat offline_inference/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -48,7 +48,7 @@ The batch running tool is designed to be used from the command line. You can run the batch with the following command, which will write its results to a file called `results.jsonl` ```console -python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ### Step 3: Check your results @@ -65,10 +65,10 @@ $ cat results.jsonl The batch runner supports remote input and output urls that are accessible via http/https. -For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run +For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run ```console -python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct +python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct ``` ## Example 3: Integrating with AWS S3 @@ -89,13 +89,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. 
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. ```console -wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl +wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl ``` Once you've created your batch file it should look like this ```console -$ cat offline_inference/openai/openai_example_batch.jsonl +$ cat offline_inference/openai_batch/openai_example_batch.jsonl {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} ``` @@ -103,7 +103,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl Now upload your batch file to your S3 bucket. ```console -aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl +aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl ``` ### Step 2: Generate your presigned urls diff --git a/examples/offline_inference/openai/openai_example_batch.jsonl b/examples/offline_inference/openai_batch/openai_example_batch.jsonl similarity index 100% rename from examples/offline_inference/openai/openai_example_batch.jsonl rename to examples/offline_inference/openai_batch/openai_example_batch.jsonl diff --git a/examples/online_serving/disaggregated_serving/README.md b/examples/online_serving/disaggregated_serving/README.md new file mode 100644 index 000000000000..090afd7515ee --- /dev/null +++ b/examples/online_serving/disaggregated_serving/README.md @@ -0,0 +1,8 @@ +# Disaggregated Serving + +This example contains scripts that demonstrate the disaggregated serving features of vLLM. + +## Files + +- `disagg_proxy_demo.py` - Demonstrates XpYd (X prefill instances, Y decode instances). +- `kv_events.sh` - Demonstrates KV cache event publishing. diff --git a/examples/online_serving/disagg_examples/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py similarity index 99% rename from examples/online_serving/disagg_examples/disagg_proxy_demo.py rename to examples/online_serving/disaggregated_serving/disagg_proxy_demo.py index a701636f357a..1bf4d50e2c92 100644 --- a/examples/online_serving/disagg_examples/disagg_proxy_demo.py +++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py @@ -4,7 +4,7 @@ example usage of XpYd disaggregated prefilling. 
We can launch multiple vllm instances (2 for prefill and 2 for decode), and launch this proxy demo through: - python3 examples/online_serving/disagg_examples/disagg_proxy_demo.py \ + python3 examples/online_serving/disaggregated_serving/disagg_proxy_demo.py \ --model $model_name \ --prefill localhost:8100 localhost:8101 \ --decode localhost:8200 localhost:8201 \ diff --git a/examples/online_serving/kv_events.sh b/examples/online_serving/disaggregated_serving/kv_events.sh similarity index 100% rename from examples/online_serving/kv_events.sh rename to examples/online_serving/disaggregated_serving/kv_events.sh diff --git a/examples/online_serving/opentelemetry/Otel.md b/examples/online_serving/opentelemetry/README.md similarity index 100% rename from examples/online_serving/opentelemetry/Otel.md rename to examples/online_serving/opentelemetry/README.md From 2aa5470ac586f1603eb9d9b30d6c41ba3b5b9cd4 Mon Sep 17 00:00:00 2001 From: Sebastian Schoennenbeck Date: Thu, 15 May 2025 18:00:21 +0200 Subject: [PATCH 35/58] [Frontend] Fix chat template content format detection (#18190) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sebastian Schönnenbeck --- vllm/entrypoints/chat_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 6f5514a6f801..adfacf2b4719 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -387,7 +387,6 @@ def resolve_hf_chat_template( def _resolve_chat_template_content_format( chat_template: Optional[str], tools: Optional[list[dict[str, Any]]], - given_format: ChatTemplateContentFormatOption, tokenizer: AnyTokenizer, *, model_config: ModelConfig, @@ -408,7 +407,7 @@ def _resolve_chat_template_content_format( detected_format = ("string" if jinja_text is None else _detect_content_format(jinja_text, default="string")) - return detected_format if given_format == "auto" else given_format + return detected_format @lru_cache @@ -451,7 +450,6 @@ def resolve_chat_template_content_format( detected_format = _resolve_chat_template_content_format( chat_template, tools, - given_format, tokenizer, model_config=model_config, ) @@ -462,7 +460,8 @@ def resolve_chat_template_content_format( detected_format=detected_format, ) - return detected_format + return detected_format if given_format == "auto" else given_format + ModalityStr = Literal["image", "audio", "video", "image_embeds"] From fadb8d5c2df1c24d891aeccfb0b11de6e03e9f27 Mon Sep 17 00:00:00 2001 From: Zhonghua Deng Date: Fri, 16 May 2025 00:01:47 +0800 Subject: [PATCH 36/58] [Bugfix]Change the exception thrown by call_hf_processor from RuntimeError to ValueError (#18181) Signed-off-by: Abatom --- vllm/inputs/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index aecddbcd7515..148b3558c15e 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -159,7 +159,7 @@ def call_hf_processor( msg = (f"Failed to apply {type(hf_processor).__name__} " f"on data={data} with kwargs={merged_kwargs}") - raise RuntimeError(msg) from exc + raise ValueError(msg) from exc class DummyData(NamedTuple): From 92540529c051fe6e8f111d7688ffac84ac561a0d Mon Sep 17 00:00:00 2001 From: TJian Date: Fri, 16 May 2025 00:53:18 +0800 Subject: [PATCH 37/58] [Bugfix] [ROCm]: Remove assertion logic when using AITER fused moe in unquantizedMethod to reenable LLama4 BF16 (#18205) Signed-off-by: tjtanaa --- 
vllm/model_executor/layers/fused_moe/layer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 0b3c02d1ba28..f1cb77f64eae 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -503,7 +503,6 @@ def forward_cuda( indices_type=torch.uint32 if self.moe.use_pplx_kernels else None) if self.rocm_aiter_moe_enabled: - assert not apply_router_weight_on_input assert expert_map is None return self.rocm_aiter_fused_experts( hidden_states=x, From e3f3aee6f4206df0c338709614056ce2860ef039 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 15 May 2025 18:59:38 +0200 Subject: [PATCH 38/58] [Misc] Avoid cuda graph log when sizes still match (#18202) Signed-off-by: NickLucche --- vllm/config.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index 19de4d0549b6..dddfdabd126a 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3950,11 +3950,12 @@ def init_with_cudagraph_sizes(self, self.cudagraph_capture_sizes = cudagraph_capture_sizes else: # de-duplicate the sizes provided by the config - self.cudagraph_capture_sizes = list( - set(self.cudagraph_capture_sizes)) - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - cudagraph_capture_sizes, self.cudagraph_capture_sizes) + dedup_sizes = list(set(self.cudagraph_capture_sizes)) + if len(dedup_sizes) < len(self.cudagraph_capture_sizes): + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + cudagraph_capture_sizes, dedup_sizes) + self.cudagraph_capture_sizes = dedup_sizes computed_compile_sizes = [] if self.compile_sizes is not None: From 0b34593017953051b3225b1483ce0f4670e3eb0e Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Thu, 15 May 2025 13:01:25 -0500 Subject: [PATCH 39/58] Adding "AMD: Tensorizer Test" to amdproduction. 
(#18216) --- .buildkite/test-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1459156f63db..c4459741712d 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -379,7 +379,7 @@ steps: - pytest -v -s kernels/mamba - label: Tensorizer Test # 11min - mirror_hardwares: [amdexperimental] + mirror_hardwares: [amdexperimental, amdproduction] soft_fail: true source_file_dependencies: - vllm/model_executor/model_loader From 8795eb9975561e19fe642b39b42e0c18280ac796 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Thu, 15 May 2025 15:59:42 -0700 Subject: [PATCH 40/58] [Bugfix] Fix test_eagle test (#18223) Signed-off-by: Lucia Fang --- tests/v1/spec_decode/test_eagle.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index a7e148d01cad..7d93a44c5059 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -115,14 +115,15 @@ def test_prepare_inputs(): ("eagle3", lambda k: _create_proposer("eagle3", k), eagle3_dir, ('model', 'embed_tokens')), ]) +@mock.patch('vllm.v1.spec_decode.eagle.get_pp_group') @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config') @mock.patch('vllm.v1.spec_decode.eagle.ModelRegistry') @mock.patch('vllm.v1.spec_decode.eagle.get_model_loader') @mock.patch('vllm.v1.spec_decode.eagle.set_default_torch_dtype') @mock.patch('vllm.v1.spec_decode.eagle.set_current_vllm_config') def test_load_model(mock_set_config, mock_set_dtype, mock_get_loader, - mock_registry, mock_get_layers, method, proposer_helper, - draft_model_dir, target_attribute_path): + mock_registry, mock_get_layers, mock_get_pp_group, method, + proposer_helper, draft_model_dir, target_attribute_path): # Setup mock for model class mock_model_cls = mock.MagicMock() @@ -158,6 +159,11 @@ def __exit__(self, exc_type, exc_val, exc_tb): # Make mock_get_layers return different values for each call mock_get_layers.side_effect = [target_attn_layers, all_attn_layers] + # Setup mock for pp group to return the appropriate value for world size + mock_pp_group = mock.MagicMock() + mock_pp_group.world_size = 2 if method == "eagle" else 1 + mock_get_pp_group.return_value = mock_pp_group + # Setup model loader mock mock_loader = mock.MagicMock() mock_get_loader.return_value = mock_loader From c7852a6d9bc0a2d5ea075060af4c15a7494499b6 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 15 May 2025 19:41:55 -0400 Subject: [PATCH 41/58] [Build] Allow shipping PTX on a per-file basis (#18155) Signed-off-by: Lucas Wilkinson --- CMakeLists.txt | 9 +++-- cmake/utils.cmake | 89 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 75 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fed6e11e5ef8..a6c54be9530b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -301,7 +301,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # Only build Marlin kernels if we are building for at least some compatible archs. # Keep building Marlin for 9.0 as there are some group sizes and shapes that # are not supported by Machete yet. 
- cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") + # 9.0 for latest bf16 atomicAdd PTX + cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_ARCHS) # @@ -445,8 +446,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") # # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x) # kernels for the remaining archs that are not already built for 3x. + # (Build 8.9 for FP8) cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS - "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") + "7.5;8.0;8.9+PTX" "${CUDA_ARCHS}") # subtract out the archs that are already built for 3x list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) if (SCALED_MM_2X_ARCHS) @@ -675,7 +677,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") CUDA_ARCHS "${CUDA_ARCHS}") list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}") - cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}") + # 9.0 for latest bf16 atomicAdd PTX + cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}") if (MARLIN_MOE_ARCHS) # diff --git a/cmake/utils.cmake b/cmake/utils.cmake index c9cd099b82a7..12e4e39024f5 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -228,11 +228,26 @@ macro(set_gencode_flags_for_srcs) "${multiValueArgs}" ${ARGN} ) foreach(_ARCH ${arg_CUDA_ARCHS}) - string(REPLACE "." "" _ARCH "${_ARCH}") - set_gencode_flag_for_srcs( - SRCS ${arg_SRCS} - ARCH "compute_${_ARCH}" - CODE "sm_${_ARCH}") + # handle +PTX suffix: generate both sm and ptx codes if requested + string(FIND "${_ARCH}" "+PTX" _HAS_PTX) + if(NOT _HAS_PTX EQUAL -1) + string(REPLACE "+PTX" "" _BASE_ARCH "${_ARCH}") + string(REPLACE "." "" _STRIPPED_ARCH "${_BASE_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "compute_${_STRIPPED_ARCH}") + else() + string(REPLACE "." "" _STRIPPED_ARCH "${_ARCH}") + set_gencode_flag_for_srcs( + SRCS ${arg_SRCS} + ARCH "compute_${_STRIPPED_ARCH}" + CODE "sm_${_STRIPPED_ARCH}") + endif() endforeach() if (${arg_BUILD_PTX_FOR_ARCH}) @@ -251,7 +266,10 @@ endmacro() # # For the given `SRC_CUDA_ARCHS` list of gencode versions in the form # `.[letter]` compute the "loose intersection" with the -# `TGT_CUDA_ARCHS` list of gencodes. +# `TGT_CUDA_ARCHS` list of gencodes. We also support the `+PTX` suffix in +# `SRC_CUDA_ARCHS` which indicates that the PTX code should be built when there +# is a CUDA_ARCH in `TGT_CUDA_ARCHS` that is equal to or larger than the +# architecture in `SRC_CUDA_ARCHS`. # The loose intersection is defined as: # { max{ x \in tgt | x <= y } | y \in src, { x \in tgt | x <= y } != {} } # where `<=` is the version comparison operator. 
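As a quick sanity check of the +PTX semantics described above, here is a hypothetical walk-through of the new logic (made-up arch lists, not taken from an actual build):

  # cuda_archs_loose_intersection(OUT_CUDA_ARCHS "8.0+PTX;9.0a" "9.0;12.0")
  # -> OUT_CUDA_ARCHS="9.0a;8.0+PTX"
  # "9.0a" is selected for the 9.0 target as before, and "8.0+PTX" is also
  # selected for the 12.0 target because PTX-tagged arches are allowed to
  # match across major versions; without the +PTX suffix, 8.0 would not match
  # 12.0 and that target would be left uncovered by this list.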
@@ -268,44 +286,63 @@ endmacro() # cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) # OUT_CUDA_ARCHS="8.0;8.6;9.0;9.0a" # +# Example With PTX: +# SRC_CUDA_ARCHS="8.0+PTX" +# TGT_CUDA_ARCHS="9.0" +# cuda_archs_loose_intersection(OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) +# OUT_CUDA_ARCHS="8.0+PTX" +# function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS) - list(REMOVE_DUPLICATES SRC_CUDA_ARCHS) - set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS}) + set(_SRC_CUDA_ARCHS "${SRC_CUDA_ARCHS}") + set(_TGT_CUDA_ARCHS ${TGT_CUDA_ARCHS}) + + # handle +PTX suffix: separate base arch for matching, record PTX requests + set(_PTX_ARCHS) + foreach(_arch ${_SRC_CUDA_ARCHS}) + if(_arch MATCHES "\\+PTX$") + string(REPLACE "+PTX" "" _base "${_arch}") + list(APPEND _PTX_ARCHS "${_base}") + list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") + list(APPEND _SRC_CUDA_ARCHS "${_base}") + endif() + endforeach() + list(REMOVE_DUPLICATES _PTX_ARCHS) + list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS) # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS set(_CUDA_ARCHS) - if ("9.0a" IN_LIST SRC_CUDA_ARCHS) - list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a") - if ("9.0" IN_LIST TGT_CUDA_ARCHS_) - list(REMOVE_ITEM TGT_CUDA_ARCHS_ "9.0") + if ("9.0a" IN_LIST _SRC_CUDA_ARCHS) + list(REMOVE_ITEM _SRC_CUDA_ARCHS "9.0a") + if ("9.0" IN_LIST TGT_CUDA_ARCHS) + list(REMOVE_ITEM _TGT_CUDA_ARCHS "9.0") set(_CUDA_ARCHS "9.0a") endif() endif() - if ("10.0a" IN_LIST SRC_CUDA_ARCHS) - list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a") + if ("10.0a" IN_LIST _SRC_CUDA_ARCHS) + list(REMOVE_ITEM _SRC_CUDA_ARCHS "10.0a") if ("10.0" IN_LIST TGT_CUDA_ARCHS) - list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0") + list(REMOVE_ITEM _TGT_CUDA_ARCHS "10.0") set(_CUDA_ARCHS "10.0a") endif() endif() - list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) + list(SORT _SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING) # for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that # is less or equal to ARCH (but has the same major version since SASS binary # compatibility is only forward compatible within the same major version). 
- foreach(_ARCH ${TGT_CUDA_ARCHS_}) + foreach(_ARCH ${_TGT_CUDA_ARCHS}) set(_TMP_ARCH) # Extract the major version of the target arch string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" TGT_ARCH_MAJOR "${_ARCH}") - foreach(_SRC_ARCH ${SRC_CUDA_ARCHS}) + foreach(_SRC_ARCH ${_SRC_CUDA_ARCHS}) # Extract the major version of the source arch string(REGEX REPLACE "^([0-9]+)\\..*$" "\\1" SRC_ARCH_MAJOR "${_SRC_ARCH}") - # Check major-version match AND version-less-or-equal + # Check version-less-or-equal, and allow PTX arches to match across majors if (_SRC_ARCH VERSION_LESS_EQUAL _ARCH) - if (SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) + if (_SRC_ARCH IN_LIST _PTX_ARCHS OR SRC_ARCH_MAJOR STREQUAL TGT_ARCH_MAJOR) set(_TMP_ARCH "${_SRC_ARCH}") endif() else() @@ -321,6 +358,18 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR endforeach() list(REMOVE_DUPLICATES _CUDA_ARCHS) + + # reapply +PTX suffix to architectures that requested PTX + set(_FINAL_ARCHS) + foreach(_arch ${_CUDA_ARCHS}) + if(_arch IN_LIST _PTX_ARCHS) + list(APPEND _FINAL_ARCHS "${_arch}+PTX") + else() + list(APPEND _FINAL_ARCHS "${_arch}") + endif() + endforeach() + set(_CUDA_ARCHS ${_FINAL_ARCHS}) + set(${OUT_CUDA_ARCHS} ${_CUDA_ARCHS} PARENT_SCOPE) endfunction() From 4e1c6a02641e427a6140d33262f1467906817781 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Thu, 15 May 2025 21:32:45 -0400 Subject: [PATCH 42/58] [Bugfix] fix rotary embedding test for _get_padded_tensor_shape (#18229) Signed-off-by: Lucas Wilkinson --- tests/kernels/core/test_pos_encoding.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index 383a3c83b84a..f327deb0e549 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -152,6 +152,10 @@ def test_batched_rotary_embedding( query = torch.randn(query_shape, dtype=dtype) key = torch.randn_like(query) if use_key else None + # slice tensor if required, noop otherwise + query = query[..., :head_size] + key = key[..., :head_size] if use_key else None + # NOTE(woosuk): The reference implementation should be executed first # because the custom kernel is in-place. 
ref_query, ref_key = rope.forward_native(positions, query, key) From ee659e3b601e886308594b3a2ee2c6edc8d1b4c8 Mon Sep 17 00:00:00 2001 From: kliuae <17350011+kliuae@users.noreply.github.com> Date: Fri, 16 May 2025 10:30:17 +0800 Subject: [PATCH 43/58] [Bugfix][ROCm] Use `chunked_prefill_paged_decode` as fallback for V1 attention on ROCm (#18093) Signed-off-by: kf --- vllm/v1/attention/backends/triton_attn.py | 109 +++++++++++++++------- 1 file changed, 77 insertions(+), 32 deletions(-) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 908bf1274125..4000f93984d3 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -7,6 +7,9 @@ from vllm import _custom_ops as ops from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionMetadata, AttentionType) +from vllm.attention.ops.chunked_prefill_paged_decode import ( + chunked_prefill_paged_decode) +from vllm.attention.ops.paged_attn import PagedAttention from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.logger import init_logger from vllm.platforms import current_platform @@ -162,19 +165,40 @@ def forward( # Whenever making a change in this method, please benchmark the # performance to make sure it does not introduce any overhead. + num_queries_per_kv = query.shape[1] // key.shape[1] + use_prefill_decode_attn = (num_queries_per_kv & + (num_queries_per_kv - 1)) != 0 + num_actual_tokens = attn_metadata.num_actual_tokens - key_cache, value_cache = kv_cache.unbind(0) - torch.ops._C_cache_ops.reshape_and_cache_flash( - key, - value, - key_cache, - value_cache, - attn_metadata.slot_mapping, - self.kv_cache_dtype, - layer._k_scale, - layer._v_scale, - ) + if use_prefill_decode_attn: + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_kv_heads, self.head_size) + + # Reshape the input keys and values and store them in the cache. + PagedAttention.write_to_paged_cache( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) + + else: + key_cache, value_cache = kv_cache.unbind(0) + torch.ops._C_cache_ops.reshape_and_cache_flash( + key, + value, + key_cache, + value_cache, + attn_metadata.slot_mapping, + self.kv_cache_dtype, + layer._k_scale, + layer._v_scale, + ) if self.kv_cache_dtype.startswith("fp8"): key_cache = key_cache.view(self.fp8_dtype) @@ -209,26 +233,47 @@ def forward( max_seqlen_k = attn_metadata.max_seq_len block_table = attn_metadata.block_table - descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) - - unified_attention( - q=query[:num_actual_tokens], - k=key_cache, - v=value_cache, - out=output[:num_actual_tokens], - cu_seqlens_q=cu_seqlens_q, - max_seqlen_q=max_seqlen_q, - seqused_k=seqused_k, - max_seqlen_k=max_seqlen_k, - softmax_scale=self.scale, - causal=True, - alibi_slopes=self.alibi_slopes, - window_size=self.sliding_window, - block_table=block_table, - softcap=self.logits_soft_cap, - q_descale=None, # Not supported - k_descale=layer._k_scale.expand(descale_shape), - v_descale=layer._v_scale.expand(descale_shape), - ) + if use_prefill_decode_attn: + # Compute attention and update output up to `num_actual_tokens`. 
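+            # This branch is taken when num_queries_per_kv is not a power of
+            # two (x & (x - 1) is non-zero exactly for non-powers-of-two);
+            # in that case the unified Triton attention path below is skipped
+            # in favor of the chunked prefill + paged decode kernels.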
+ chunked_prefill_paged_decode(query=query[:num_actual_tokens], + key=key[:num_actual_tokens], + value=value[:num_actual_tokens], + output=output[:num_actual_tokens], + kv_cache_dtype=self.kv_cache_dtype, + key_cache=key_cache, + value_cache=value_cache, + block_table=block_table, + query_start_loc=cu_seqlens_q, + seq_lens=seqused_k, + max_seq_len=max_seqlen_k, + max_query_len=max_seqlen_q, + k_scale=layer._k_scale, + v_scale=layer._v_scale, + alibi_slopes=self.alibi_slopes, + sliding_window=self.sliding_window[0], + sm_scale=self.scale) + + else: + descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1]) + + unified_attention( + q=query[:num_actual_tokens], + k=key_cache, + v=value_cache, + out=output[:num_actual_tokens], + cu_seqlens_q=cu_seqlens_q, + max_seqlen_q=max_seqlen_q, + seqused_k=seqused_k, + max_seqlen_k=max_seqlen_k, + softmax_scale=self.scale, + causal=True, + alibi_slopes=self.alibi_slopes, + window_size=self.sliding_window, + block_table=block_table, + softcap=self.logits_soft_cap, + q_descale=None, # Not supported + k_descale=layer._k_scale.expand(descale_shape), + v_descale=layer._v_scale.expand(descale_shape), + ) return output From f4937a51c138978928f38da6a2d3b30c53286240 Mon Sep 17 00:00:00 2001 From: Sky Lee <46676799+skylee-01@users.noreply.github.com> Date: Fri, 16 May 2025 12:05:31 +0800 Subject: [PATCH 44/58] [Model] vLLM v1 supports Medusa (#17956) Signed-off-by: lisiqi23 Signed-off-by: skylee-01 <497627264@qq.com> Co-authored-by: lisiqi23 --- vllm/engine/arg_utils.py | 5 +- vllm/model_executor/models/medusa.py | 5 +- vllm/v1/spec_decode/medusa.py | 74 ++++++++++++++++++++++++++++ vllm/v1/worker/gpu_model_runner.py | 26 ++++++++++ 4 files changed, 108 insertions(+), 2 deletions(-) create mode 100644 vllm/v1/spec_decode/medusa.py diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 240142a1c5d1..3e942b0f0ff9 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1324,19 +1324,22 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: # Only Ngram speculative decoding so far. is_ngram_enabled = False is_eagle_enabled = False + is_medusa_enabled = False if self.speculative_config is not None: # This is supported but experimental (handled below). speculative_method = self.speculative_config.get("method") if speculative_method: if speculative_method in ("ngram", "[ngram]"): is_ngram_enabled = True + elif speculative_method == "medusa": + is_medusa_enabled = True elif speculative_method in ("eagle", "eagle3"): is_eagle_enabled = True else: speculative_model = self.speculative_config.get("model") if speculative_model in ("ngram", "[ngram]"): is_ngram_enabled = True - if not (is_ngram_enabled or is_eagle_enabled): + if not (is_ngram_enabled or is_eagle_enabled or is_medusa_enabled): # Other speculative decoding methods are not supported yet. 
_raise_or_fallback(feature_name="Speculative Decoding", recommend_to_remove=False) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index ac0b281f359c..4724cbe56445 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -51,7 +51,10 @@ class Medusa(nn.Module): needs to have truncated_vocab_size (=k) as an attribute.""" def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: - config = vllm_config.model_config.hf_config + if hasattr(vllm_config, 'draft_model_config'): + config = vllm_config.draft_model_config.hf_config + else: + config = vllm_config.model_config.hf_config super().__init__() self.config = config self.blocks = nn.ModuleList([ diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py new file mode 100644 index 000000000000..14bc9c9e0d1a --- /dev/null +++ b/vllm/v1/spec_decode/medusa.py @@ -0,0 +1,74 @@ +# SPDX-License-Identifier: Apache-2.0 + +import torch +import torch.nn as nn + +from vllm.config import VllmConfig, set_current_vllm_config +from vllm.forward_context import set_forward_context +from vllm.logger import init_logger +from vllm.model_executor.model_loader import get_model_loader +from vllm.model_executor.model_loader.utils import set_default_torch_dtype +from vllm.model_executor.models.medusa import Medusa +from vllm.v1.sample.metadata import SamplingMetadata + +# Initialize logger +logger = init_logger(__name__) + + +class MedusaProposer: + """ + Medusa proposer class for generating token sequences + """ + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + # Save config parameters + self.vllm_config = vllm_config + self.device = device + self.max_num_tokens = ( + vllm_config.scheduler_config.max_num_batched_tokens) + self.hidden_size = vllm_config.speculative_config.\ + draft_model_config.get_hidden_size( + ) + self.dtype = vllm_config.model_config.dtype + + def propose( + self, + target_hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + # Generate blocks and compute logits + blocks = self.model(target_hidden_states) + logits = self.model.compute_logits(blocks, None) + + # Get draft tokens and transpose the result + draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits] + return [list(row) for row in zip(*draft_tokens)] + + def load_model(self, target_model: nn.Module) -> None: + # Get model loader and config + loader = get_model_loader(self.vllm_config.load_config) + draft_config = self.vllm_config.speculative_config.draft_model_config + + # Load model with proper dtype and config + with set_default_torch_dtype(draft_config.dtype), \ + set_current_vllm_config(self.vllm_config): + self.model = Medusa( + vllm_config=self.vllm_config.speculative_config).to( + self.device) + + # Load model weights + weights = loader.get_all_weights(draft_config, self.model) + self.model.load_weights(weights) + + @torch.inference_mode() + def dummy_run(self, num_tokens: int) -> None: + hidden_states = torch.zeros((self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) + with set_forward_context(None, self.vllm_config, + num_tokens=num_tokens): + self.model(hidden_states) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1b34a9fb0616..0788ac5adde8 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -47,6 +47,7 @@ from vllm.v1.sample.rejection_sampler import RejectionSampler from 
vllm.v1.sample.sampler import Sampler from vllm.v1.spec_decode.eagle import EagleProposer +from vllm.v1.spec_decode.medusa import MedusaProposer from vllm.v1.spec_decode.metadata import SpecDecodeMetadata from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.utils import is_spec_decode_supported @@ -156,6 +157,10 @@ def __init__( self.device) # type: ignore if self.speculative_config.method == "eagle3": self.use_aux_hidden_state_outputs = True + elif self.speculative_config.method == "medusa": + self.drafter = MedusaProposer( + vllm_config=self.vllm_config, + device=self.device) # type: ignore else: raise ValueError("Unknown speculative decoding method: " f"{self.speculative_config.method}") @@ -1254,6 +1259,27 @@ def execute_model( assert isinstance(self.drafter, NgramProposer) spec_token_ids = self.generate_draft_token_ids( valid_sampled_token_ids, sampling_metadata) + elif self.speculative_config.method == "medusa": + assert isinstance(self.drafter, MedusaProposer) + if max_gen_len == 1: + hidden_states = sample_hidden_states + else: + indices = [] + offset = 0 + for num_draft, tokens in zip( + spec_decode_metadata.num_draft_tokens, + valid_sampled_token_ids): + indices.append(offset + len(tokens) - 1) + offset += num_draft + 1 + + indices = torch.tensor(indices, + device=sample_hidden_states.device) + hidden_states = sample_hidden_states[indices] + + spec_token_ids = self.drafter.propose( + target_hidden_states=hidden_states, + sampling_metadata=sampling_metadata, + ) elif self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) # TODO(woosuk): Refactor the loop. From b18201fe060a3ddcc088f8aea3cf1d7c4b461288 Mon Sep 17 00:00:00 2001 From: Harry Mellor <19981378+hmellor@users.noreply.github.com> Date: Fri, 16 May 2025 05:05:34 +0100 Subject: [PATCH 45/58] Allow users to pass arbitrary JSON keys from CLI (#18208) Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> --- tests/engine/test_arg_utils.py | 4 +-- tests/test_utils.py | 25 ++++++++++++++++++ vllm/engine/arg_utils.py | 6 ++++- vllm/utils.py | 46 ++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 3 deletions(-) diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index ce8873d58d4d..05d9cfc7ab74 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -181,8 +181,8 @@ def test_get_kwargs(): # literals of literals should have merged choices assert kwargs["literal_literal"]["choices"] == [1, 2] # dict should have json tip in help - json_tip = "\n\nShould be a valid JSON string." 
- assert kwargs["json_tip"]["help"].endswith(json_tip) + json_tip = "Should either be a valid JSON string or JSON keys" + assert json_tip in kwargs["json_tip"]["help"] # nested config should should construct the nested config assert kwargs["nested_config"]["type"]('{"field": 2}') == NestedConfig(2) # from_cli configs should be constructed with the correct method diff --git a/tests/test_utils.py b/tests/test_utils.py index ea7db0a79c86..0b88d05efeaa 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,6 +3,7 @@ import asyncio import hashlib +import json import pickle import socket from collections.abc import AsyncIterator @@ -138,6 +139,7 @@ def parser(): parser.add_argument('--model-name') parser.add_argument('--batch-size', type=int) parser.add_argument('--enable-feature', action='store_true') + parser.add_argument('--hf-overrides', type=json.loads) return parser @@ -251,6 +253,29 @@ def test_no_model_tag(parser_with_config, cli_config_file): parser_with_config.parse_args(['serve', '--config', cli_config_file]) +def test_dict_args(parser): + args = [ + "--model-name=something.something", + "--hf-overrides.key1", + "val1", + "--hf-overrides.key2.key3", + "val2", + "--hf-overrides.key2.key4", + "val3", + "--hf-overrides.key5=val4", + ] + parsed_args = parser.parse_args(args) + assert parsed_args.model_name == "something.something" + assert parsed_args.hf_overrides == { + "key1": "val1", + "key2": { + "key3": "val2", + "key4": "val3", + }, + "key5": "val4", + } + + # yapf: enable @pytest.mark.parametrize( "callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported", diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 3e942b0f0ff9..6fdb5e6c3772 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -183,7 +183,11 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]: kwargs[name] = {"default": default, "help": help} # Set other kwargs based on the type hints - json_tip = "\n\nShould be a valid JSON string." + json_tip = """\n\nShould either be a valid JSON string or JSON keys + passed individually. For example, the following sets of arguments are + equivalent:\n\n + - `--json-arg '{"key1": "value1", "key2": {"key3": "value2"}}'`\n + - `--json-arg.key1 value1 --json-arg.key2.key3 value2`\n\n""" if dataclass_cls is not None: dataclass_init = lambda x, f=dataclass_cls: f(**json.loads(x)) # Special case for configs with a from_cli method diff --git a/vllm/utils.py b/vllm/utils.py index edfbb8c9481e..0cd90c130d3e 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -15,6 +15,7 @@ import importlib.util import inspect import ipaddress +import json import multiprocessing import os import pickle @@ -1419,6 +1420,51 @@ def parse_args( # type: ignore[override] else: processed_args.append(arg) + def create_nested_dict(keys: list[str], value: str): + """Creates a nested dictionary from a list of keys and a value. 
+ + For example, `keys = ["a", "b", "c"]` and `value = 1` will create: + `{"a": {"b": {"c": 1}}}` + """ + nested_dict: Any = value + for key in reversed(keys): + nested_dict = {key: nested_dict} + return nested_dict + + def recursive_dict_update(original: dict, update: dict): + """Recursively updates a dictionary with another dictionary.""" + for k, v in update.items(): + if isinstance(v, dict) and isinstance(original.get(k), dict): + recursive_dict_update(original[k], v) + else: + original[k] = v + + delete = set() + dict_args: dict[str, dict] = defaultdict(dict) + for i, processed_arg in enumerate(processed_args): + if processed_arg.startswith("--") and "." in processed_arg: + if "=" in processed_arg: + processed_arg, value = processed_arg.split("=", 1) + if "." not in processed_arg: + # False positive, . was only in the value + continue + else: + value = processed_args[i + 1] + delete.add(i + 1) + key, *keys = processed_arg.split(".") + # Merge all values with the same key into a single dict + arg_dict = create_nested_dict(keys, value) + recursive_dict_update(dict_args[key], arg_dict) + delete.add(i) + # Filter out the dict args we set to None + processed_args = [ + a for i, a in enumerate(processed_args) if i not in delete + ] + # Add the dict args back as if they were originally passed as JSON + for dict_arg, dict_value in dict_args.items(): + processed_args.append(dict_arg) + processed_args.append(json.dumps(dict_value)) + return super().parse_args(processed_args, namespace) def check_port(self, value): From 6b31c84affbcd2f672915ca2ef5bb39819566441 Mon Sep 17 00:00:00 2001 From: Will Eaton Date: Fri, 16 May 2025 00:07:28 -0400 Subject: [PATCH 46/58] Throw better error for when running into k8s service discovery issue (#18209) Signed-off-by: Will Eaton --- tests/test_vllm_port.py | 35 +++++++++++++++++++++++++++++++++++ vllm/envs.py | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 3 deletions(-) create mode 100644 tests/test_vllm_port.py diff --git a/tests/test_vllm_port.py b/tests/test_vllm_port.py new file mode 100644 index 000000000000..ccbb36bf4c06 --- /dev/null +++ b/tests/test_vllm_port.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: Apache-2.0 + +import os +from unittest.mock import patch + +import pytest + +from vllm.envs import get_vllm_port + + +def test_get_vllm_port_not_set(): + """Test when VLLM_PORT is not set.""" + with patch.dict(os.environ, {}, clear=True): + assert get_vllm_port() is None + + +def test_get_vllm_port_valid(): + """Test when VLLM_PORT is set to a valid integer.""" + with patch.dict(os.environ, {"VLLM_PORT": "5678"}, clear=True): + assert get_vllm_port() == 5678 + + +def test_get_vllm_port_invalid(): + """Test when VLLM_PORT is set to a non-integer value.""" + with (patch.dict(os.environ, {"VLLM_PORT": "abc"}, clear=True), + pytest.raises(ValueError, match="must be a valid integer")): + get_vllm_port() + + +def test_get_vllm_port_uri(): + """Test when VLLM_PORT is set to a URI.""" + with (patch.dict(os.environ, {"VLLM_PORT": "tcp://localhost:5678"}, + clear=True), + pytest.raises(ValueError, match="appears to be a URI")): + get_vllm_port() diff --git a/vllm/envs.py b/vllm/envs.py index fe3fa91fbe33..dc23c8ea5314 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -139,6 +139,39 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: return int(value) +def get_vllm_port() -> Optional[int]: + """Get the port from VLLM_PORT environment variable. 
+ + Returns: + The port number as an integer if VLLM_PORT is set, None otherwise. + + Raises: + ValueError: If VLLM_PORT is a URI, suggest k8s service discovery issue. + """ + if 'VLLM_PORT' not in os.environ: + return None + + port = os.getenv('VLLM_PORT', '0') + + try: + return int(port) + except ValueError as err: + from urllib.parse import urlparse + try: + parsed = urlparse(port) + if parsed.scheme: + raise ValueError( + f"VLLM_PORT '{port}' appears to be a URI. " + "This may be caused by a Kubernetes service discovery issue" + "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html" + ) + except Exception: + pass + + raise ValueError( + f"VLLM_PORT '{port}' must be a valid integer") from err + + # The begin-* and end* here are used by the documentation generator # to extract the used env vars. @@ -219,10 +252,8 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]: # Note: if VLLM_PORT is set, and some code asks for multiple ports, the # VLLM_PORT will be used as the first port, and the rest will be generated # by incrementing the VLLM_PORT value. - # '0' is used to make mypy happy 'VLLM_PORT': - lambda: int(os.getenv('VLLM_PORT', '0')) - if 'VLLM_PORT' in os.environ else None, + get_vllm_port, # path used for ipc when the frontend api server is running in # multi-processing mode to communicate with the backend engine process. From 3d2779c29a9f5003f6fec6ca07205147e2c987d1 Mon Sep 17 00:00:00 2001 From: Lucia Fang <116399278+luccafong@users.noreply.github.com> Date: Thu, 15 May 2025 22:28:27 -0700 Subject: [PATCH 47/58] [Feature] Support Pipeline Parallism in torchrun SPMD offline inference for V1 (#17827) Signed-off-by: Lucia Fang --- .buildkite/test-pipeline.yaml | 2 ++ .../offline_inference/torchrun_example.py | 23 ++++++++----- tests/distributed/test_torchrun_example.py | 3 +- vllm/config.py | 1 - .../device_communicators/custom_all_reduce.py | 6 ++-- vllm/engine/arg_utils.py | 5 +-- vllm/executor/uniproc_executor.py | 3 -- vllm/v1/worker/gpu_model_runner.py | 33 +++++++++++++++---- vllm/v1/worker/gpu_worker.py | 6 ++-- 9 files changed, 55 insertions(+), 27 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index c4459741712d..461fb6d30c45 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -148,6 +148,8 @@ steps: # test with tp=2 and external_dp=2 - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py # test with internal dp - python3 ../examples/offline_inference/data_parallel.py - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py index c6d9e6b47e21..bb61a0a29e32 100644 --- a/examples/offline_inference/torchrun_example.py +++ b/examples/offline_inference/torchrun_example.py @@ -8,6 +8,8 @@ see `tests/distributed/test_torchrun_example.py` for the unit test. """ +import torch.distributed as dist + from vllm import LLM, SamplingParams # Create prompts, the same across all ranks @@ -27,23 +29,26 @@ # all ranks have the same random seed, so that sampling can be # deterministic across ranks. 
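# Note: with tensor_parallel_size=2 and pipeline_parallel_size=2 below, the
# external launcher needs world_size = 2 * 2 = 4 ranks, e.g. a launch command
# along the lines of:
#   torchrun --nproc-per-node=4 torchrun_example.py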
llm = LLM( - model="facebook/opt-125m", + model="meta-llama/Llama-3.1-8B", tensor_parallel_size=2, + pipeline_parallel_size=2, distributed_executor_backend="external_launcher", - seed=0, + max_model_len=32768, + seed=1, ) outputs = llm.generate(prompts, sampling_params) # all ranks will have the same outputs -print("-" * 50) -for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}\n" - f"Generated text: {generated_text!r}") +if dist.get_rank() == 0: print("-" * 50) -""" + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}\n" + f"Generated text: {generated_text!r}\n") + print("-" * 50) + """ Further tips: 1. to communicate control messages across all ranks, use the cpu group, diff --git a/tests/distributed/test_torchrun_example.py b/tests/distributed/test_torchrun_example.py index 0420a6454d46..bb38e908b734 100644 --- a/tests/distributed/test_torchrun_example.py +++ b/tests/distributed/test_torchrun_example.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # unit test for `examples/offline_inference/torchrun_example.py` - +import os import random import torch.distributed as dist @@ -25,6 +25,7 @@ # to test if all ranks agree on the same kv cache configuration. llm = LLM(model="facebook/opt-125m", tensor_parallel_size=2, + pipeline_parallel_size=int(os.getenv("PP_SIZE", 1)), distributed_executor_backend="external_launcher", gpu_memory_utilization=random.uniform(0.7, 0.9), swap_space=random.randint(1, 4), diff --git a/vllm/config.py b/vllm/config.py index dddfdabd126a..d07a1ff05234 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1695,7 +1695,6 @@ class ParallelConfig: """Port of the data parallel master.""" enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" - max_parallel_loading_workers: Optional[int] = None """Maximum number of parallel loading workers when loading model sequentially in multiple batches. 
To avoid RAM OOM when using tensor diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 7567161b6ac7..5c2dbcc27b13 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -265,7 +265,8 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]: def close(self): if not self.disabled and self._ptr: - ops.dispose(self._ptr) + if ops is not None: + ops.dispose(self._ptr) self._ptr = 0 self.free_shared_buffer(self.meta_ptrs, rank=self.rank) self.free_shared_buffer(self.buffer_ptrs, rank=self.rank) @@ -298,4 +299,5 @@ def free_shared_buffer(pointers: list[int], rank: Optional[int] = 0) -> None: if rank is None: rank = dist.get_rank(group=group) - ops.free_shared_buffer(pointers[rank]) + if ops is not None: + ops.free_shared_buffer(pointers[rank]) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6fdb5e6c3772..dc2bb3a52cac 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1383,9 +1383,10 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: return False if (self.pipeline_parallel_size > 1 - and self.distributed_executor_backend not in ["ray", "mp"]): + and self.distributed_executor_backend + not in ("ray", "mp", "external_launcher")): name = "Pipeline Parallelism without Ray distributed executor " \ - "or multiprocessing executor" + "or multiprocessing executor or external launcher" _raise_or_fallback(feature_name=name, recommend_to_remove=False) return False diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 2e4b47c1e24a..1d3a6e443a80 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -86,9 +86,6 @@ class ExecutorWithExternalLauncher(UniProcExecutor): def _init_executor(self) -> None: """Initialize the worker and load the model. """ - assert self.vllm_config.parallel_config.pipeline_parallel_size == 1, \ - ("ExecutorWithExternalLauncher does not " - "support pipeline parallelism.") assert self.vllm_config.scheduler_config.delay_factor == 0.0, \ ("ExecutorWithExternalLauncher needs deterministic " "execution, so it" diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 0788ac5adde8..cb802fd4f102 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -22,7 +22,8 @@ has_kv_transfer_group) from vllm.distributed.kv_transfer.kv_connector.v1 import KVConnectorBase_V1 from vllm.distributed.parallel_state import ( - get_pp_group, graph_capture, prepare_communication_buffer_for_model) + get_pp_group, get_tp_group, graph_capture, + prepare_communication_buffer_for_model) from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding @@ -1162,13 +1163,32 @@ def execute_model( hidden_states, aux_hidden_states = model_output else: hidden_states = model_output - + # Broadcast PP output for external_launcher (torchrun) + # to make sure we are synced across pp ranks + # TODO: Support overlapping mirco-batches + # https://github.com/vllm-project/vllm/issues/18019 + broadcast_pp_output = \ + self.parallel_config.distributed_executor_backend \ + == "external_launcher" and len(get_pp_group().ranks) > 0 if not get_pp_group().is_last_rank: # For mid-pipeline stages, return the hidden states. 
- return hidden_states - - sample_hidden_states = hidden_states[logits_indices] - logits = self.model.compute_logits(sample_hidden_states, None) + if not broadcast_pp_output: + return hidden_states + assert isinstance(hidden_states, IntermediateTensors) + get_pp_group().send_tensor_dict(hidden_states.tensors, + all_gather_group=get_tp_group()) + logits = None + else: + sample_hidden_states = hidden_states[logits_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + if broadcast_pp_output: + model_output_broadcast_data = { + "logits": logits.contiguous(), + } if logits is not None else {} + model_output_broadcast_data = get_pp_group().broadcast_tensor_dict( + model_output_broadcast_data, src=len(get_pp_group().ranks) - 1) + assert model_output_broadcast_data is not None + logits = model_output_broadcast_data["logits"] # Apply structured output bitmasks if present if scheduler_output.grammar_bitmask is not None: @@ -1186,6 +1206,7 @@ def execute_model( # creates a new tensor with separate storage from the original # logits tensor. This means any in-place operations on bonus_logits # won't affect the original logits tensor. + assert logits is not None bonus_logits = logits[spec_decode_metadata.bonus_logits_indices] sampler_output = self.sampler( logits=bonus_logits, diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index d85701fa93df..93129d987940 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -275,13 +275,13 @@ def execute_model( output = self.model_runner.execute_model(scheduler_output, intermediate_tensors) - - if not get_pp_group().is_last_rank: + parallel_config = self.vllm_config.parallel_config + if parallel_config.distributed_executor_backend != "external_launcher" \ + and not get_pp_group().is_last_rank: assert isinstance(output, IntermediateTensors) get_pp_group().send_tensor_dict(output.tensors, all_gather_group=get_tp_group()) return None - assert isinstance(output, ModelRunnerOutput) return output if self.is_driver_worker else None From 5c04bb8b863bfdef8122b193631479315cc764f5 Mon Sep 17 00:00:00 2001 From: David Xia Date: Fri, 16 May 2025 02:05:34 -0400 Subject: [PATCH 48/58] [doc] fix multimodal example script (#18089) Signed-off-by: David Xia --- ...i_chat_completion_client_for_multimodal.py | 26 +++++++++++-------- examples/online_serving/utils.py | 25 ++++++++++++++++++ 2 files changed, 40 insertions(+), 11 deletions(-) create mode 100644 examples/online_serving/utils.py diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index cffd093c983a..2707d46f46e2 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 -"""An example showing how to use vLLM to serve multimodal models +"""An example showing how to use vLLM to serve multimodal models and run online serving with OpenAI client. 
Launch the vLLM server with the following command: @@ -12,12 +12,18 @@ --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt '{"image":2}' (audio inference with Ultravox) -vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096 +vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b \ + --max-model-len 4096 --trust-remote-code + +run the script with +python openai_chat_completion_client_for_multimodal.py --chat-type audio """ + import base64 import requests from openai import OpenAI +from utils import get_first_model from vllm.utils import FlexibleArgumentParser @@ -31,9 +37,6 @@ base_url=openai_api_base, ) -models = client.models.list() -model = models.data[0].id - def encode_base64_content_from_url(content_url: str) -> str: """Encode a content retrieved from a remote url to base64 format.""" @@ -46,7 +49,7 @@ def encode_base64_content_from_url(content_url: str) -> str: # Text-only inference -def run_text_only() -> None: +def run_text_only(model: str) -> None: chat_completion = client.chat.completions.create( messages=[{ "role": "user", @@ -61,7 +64,7 @@ def run_text_only() -> None: # Single-image input inference -def run_single_image() -> None: +def run_single_image(model: str) -> None: ## Use image url in the payload image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" @@ -117,7 +120,7 @@ def run_single_image() -> None: # Multi-image input inference -def run_multi_image() -> None: +def run_multi_image(model: str) -> None: image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" chat_completion_from_url = client.chat.completions.create( @@ -152,7 +155,7 @@ def run_multi_image() -> None: # Video input inference -def run_video() -> None: +def run_video(model: str) -> None: video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" video_base64 = encode_base64_content_from_url(video_url) @@ -208,7 +211,7 @@ def run_video() -> None: # Audio input inference -def run_audio() -> None: +def run_audio(model: str) -> None: from vllm.assets.audio import AudioAsset audio_url = AudioAsset("winning_call").url @@ -318,7 +321,8 @@ def parse_args(): def main(args) -> None: chat_type = args.chat_type - example_function_map[chat_type]() + model = get_first_model(client) + example_function_map[chat_type](model) if __name__ == "__main__": diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py new file mode 100644 index 000000000000..4826e8e20528 --- /dev/null +++ b/examples/online_serving/utils.py @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: Apache-2.0 +from openai import APIConnectionError, OpenAI +from openai.pagination import SyncPage +from openai.types.model import Model + + +def get_first_model(client: OpenAI) -> str: + """ + Get the first model from the vLLM server. + """ + try: + models: SyncPage[Model] = client.models.list() + except APIConnectionError as e: + raise RuntimeError( + "Failed to get the list of models from the vLLM server at " + f"{client.base_url} with API key {client.api_key}. Check\n" + "1. the server is running\n" + "2. the server URL is correct\n" + "3. 
the API key is correct") from e + + if len(models.data) == 0: + raise RuntimeError( + f"No models found on the vLLM server at {client.base_url}") + + return models.data[0].id From 67da5720d4ed2aa1f615ec812031f4f3753b3f62 Mon Sep 17 00:00:00 2001 From: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com> Date: Fri, 16 May 2025 10:31:02 +0400 Subject: [PATCH 49/58] [PERF] Speed up Qwen2.5-VL model by speed up rotary position embedding (#17973) Signed-off-by: Vadim Gimpelson --- vllm/model_executor/models/qwen2_5_vl.py | 204 ++++++++++++++--------- 1 file changed, 121 insertions(+), 83 deletions(-) diff --git a/vllm/model_executor/models/qwen2_5_vl.py b/vllm/model_executor/models/qwen2_5_vl.py index 5904ad1f1f24..68dd07820189 100644 --- a/vllm/model_executor/models/qwen2_5_vl.py +++ b/vllm/model_executor/models/qwen2_5_vl.py @@ -25,7 +25,7 @@ # limitations under the License. """Inference-only Qwen2.5-VL model compatible with HuggingFace weights.""" from collections.abc import Iterable, Mapping -from functools import partial +from functools import lru_cache, partial from typing import Callable, Literal, Optional, TypedDict, Union import torch @@ -478,8 +478,8 @@ def __init__(self, dim: int, theta: float = 10000.0) -> None: super().__init__() self.dim = dim self.theta = theta - inv_freq = 1.0 / (theta - **(torch.arange(0, dim, 2, dtype=torch.float) / dim)) + inv_freq = 1.0 / (theta**( + torch.arange(0, dim, 2, dtype=torch.float, device='cpu') / dim)) self.register_buffer("inv_freq", inv_freq, persistent=False) self._seq_len_cached = 0 self._freqs_cached = None @@ -520,7 +520,7 @@ def __init__( self.hidden_size = vision_config.hidden_size self.num_heads = vision_config.num_heads - # args for get_window_index + # args for get_window_index_thw self.window_size = vision_config.window_size self.patch_size = vision_config.patch_size self.spatial_merge_size = vision_config.spatial_merge_size @@ -567,65 +567,71 @@ def dtype(self) -> torch.dtype: def device(self) -> torch.device: return self.patch_embed.proj.weight.device - def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: - pos_ids = [] - for t, h, w in grid_thw: - hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) - wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) - hpos_ids = hpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - wpos_ids = wpos_ids.reshape( - h // self.spatial_merge_size, - self.spatial_merge_size, - w // self.spatial_merge_size, - self.spatial_merge_size, - ).permute(0, 2, 1, 3).flatten() - pos_ids.append( - torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)) - pos_ids = torch.cat(pos_ids, dim=0) - max_grid_size = grid_thw[:, 1:].max() - rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) + def rotary_pos_emb_thw(self, t, h, w): + hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w) + wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1) + hpos_ids = hpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + wpos_ids = wpos_ids.reshape( + h // self.spatial_merge_size, + self.spatial_merge_size, + w // self.spatial_merge_size, + self.spatial_merge_size, + ).permute(0, 2, 1, 3).flatten() + pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1) + max_size = max(h, w) + rotary_pos_emb_full = self.rotary_pos_emb(max_size) rotary_pos_emb = 
rotary_pos_emb_full[pos_ids].flatten(1) + rotary_pos_emb = rotary_pos_emb.reshape( + rotary_pos_emb.shape[0] // self.spatial_merge_unit, + self.spatial_merge_unit, -1) + return rotary_pos_emb - def get_window_index(self, grid_thw): - window_index: list = [] - cu_window_seqlens: list = [0] - window_index_id = 0 + def get_window_index_thw(self, grid_t, grid_h, grid_w): vit_merger_window_size = (self.window_size // self.spatial_merge_size // self.patch_size) - for grid_t, grid_h, grid_w in grid_thw: - llm_grid_h = grid_h // self.spatial_merge_size - llm_grid_w = grid_w // self.spatial_merge_size - index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( - grid_t, llm_grid_h, llm_grid_w) - pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size - pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size - num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size - num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size - index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) - index_padded = index_padded.reshape(grid_t, num_windows_h, - vit_merger_window_size, - num_windows_w, - vit_merger_window_size) - index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( - grid_t, num_windows_h * num_windows_w, vit_merger_window_size, - vit_merger_window_size) - seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) - index_padded = index_padded.reshape(-1) - index_new = index_padded[index_padded != -100] - window_index.append(index_new + window_index_id) - cu_seqlens_tmp = seqlens.cumsum( - 0) * self.spatial_merge_unit + cu_window_seqlens[-1] - cu_window_seqlens.extend(cu_seqlens_tmp.tolist()) - window_index_id += (grid_t * llm_grid_h * llm_grid_w).item() - window_index = torch.cat(window_index, dim=0) - return window_index, cu_window_seqlens + llm_grid_h = grid_h // self.spatial_merge_size + llm_grid_w = grid_w // self.spatial_merge_size + index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape( + grid_t, llm_grid_h, llm_grid_w) + pad_h = vit_merger_window_size - llm_grid_h % vit_merger_window_size + pad_w = vit_merger_window_size - llm_grid_w % vit_merger_window_size + num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size + num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size + index_padded = F.pad(index, (0, pad_w, 0, pad_h), 'constant', -100) + index_padded = index_padded.reshape(grid_t, num_windows_h, + vit_merger_window_size, + num_windows_w, + vit_merger_window_size) + index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape( + grid_t, num_windows_h * num_windows_w, vit_merger_window_size, + vit_merger_window_size) + seqlens = (index_padded != -100).sum([2, 3]).reshape(-1) + index_padded = index_padded.reshape(-1) + index_new = index_padded[index_padded != -100] + cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_seqlens_tmp = cu_seqlens_tmp.to(dtype=torch.int32) + cu_seqlens_tmp = torch.unique_consecutive(cu_seqlens_tmp) + + return index_new, cu_seqlens_tmp + + @lru_cache(maxsize=1024) # noqa: B019 + def get_rope_by_thw(self, t, h, w): + window_index_thw, cu_seqlens_window_thw = self.get_window_index_thw( + t, h, w) + rotary_pos_emb_thw = self.rotary_pos_emb_thw(t, h, w) + rotary_pos_emb_thw = rotary_pos_emb_thw[window_index_thw, :, :] + rotary_pos_emb_thw = rotary_pos_emb_thw.flatten(start_dim=0, end_dim=1) + cu_seqlens_thw = torch.repeat_interleave( + torch.tensor([h * w], dtype=torch.int32), t) + return (rotary_pos_emb_thw, window_index_thw, cu_seqlens_window_thw, + cu_seqlens_thw) def 
compute_attn_mask_seqlen( self, @@ -641,45 +647,74 @@ def compute_attn_mask_seqlen( def forward( self, x: torch.Tensor, - grid_thw: torch.Tensor, + grid_thw: list[list[int]], ) -> torch.Tensor: # patchify + seq_len, _ = x.size() + rotary_pos_emb = [] + window_index: list = [] + cu_window_seqlens: list = [torch.tensor([0], dtype=torch.int32)] + cu_seqlens: list = [] + hidden_states = x.to(device=self.device, dtype=self.dtype) hidden_states = self.patch_embed(hidden_states) - # compute position embedding - rotary_pos_emb = self.rot_pos_emb(grid_thw) + window_index_id = 0 + cu_window_seqlens_last = 0 + for t, h, w in grid_thw: + t, h, w = int(t), int(h), int(w) + llm_h = h // self.spatial_merge_size + llm_w = w // self.spatial_merge_size + + ( + rotary_pos_emb_thw, + window_index_thw, + cu_seqlens_window_thw, + cu_seqlens_thw, + ) = self.get_rope_by_thw(t, h, w) + + window_index.append(window_index_thw + window_index_id) + window_index_id += (t * llm_h * llm_w) + + cu_seqlens_window_thw = (cu_seqlens_window_thw + + cu_window_seqlens_last) + cu_window_seqlens_last = cu_seqlens_window_thw[-1] + cu_window_seqlens.append(cu_seqlens_window_thw) - # windows attention - window_index, cu_window_seqlens = self.get_window_index(grid_thw) - cu_window_seqlens = torch.tensor( - cu_window_seqlens, - device=hidden_states.device, - dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32) + rotary_pos_emb.append(rotary_pos_emb_thw) + + cu_seqlens.append(cu_seqlens_thw) + + rotary_pos_emb = torch.cat(rotary_pos_emb) + window_index = torch.cat(window_index) + cu_window_seqlens = torch.cat(cu_window_seqlens) cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens) - seq_len, _ = hidden_states.size() - hidden_states = hidden_states.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - hidden_states = hidden_states[window_index, :, :] - hidden_states = hidden_states.reshape(seq_len, -1) - rotary_pos_emb = rotary_pos_emb.reshape( - seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) - rotary_pos_emb = rotary_pos_emb[window_index, :, :] - rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) - # compute cu_seqlens - cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, 0]).cumsum( - dim=0, dtype=torch.int32) + cu_seqlens = torch.cat(cu_seqlens) + cu_seqlens = torch.cumsum(cu_seqlens, dim=0, dtype=torch.int32) cu_seqlens = F.pad(cu_seqlens, (1, 0), "constant", 0) # transformers - hidden_states = hidden_states.unsqueeze(1) - # pre-compute seqlens for window/full attn to reduce cuMemcpy operations max_seqlen_full, seqlens_full = self.compute_attn_mask_seqlen( cu_seqlens) max_seqlen_window, seqlens_window = self.compute_attn_mask_seqlen( cu_window_seqlens) + + cu_seqlens = cu_seqlens.to(device=self.device, non_blocking=True) + cu_window_seqlens = cu_window_seqlens.to(device=self.device, + non_blocking=True) + rotary_pos_emb = rotary_pos_emb.to(device=self.device, + non_blocking=True) + window_index = window_index.to(device=hidden_states.device, + non_blocking=True) + + hidden_states = hidden_states.reshape( + seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1) + hidden_states = hidden_states[window_index, :, :] + hidden_states = hidden_states.reshape(seq_len, -1) + + hidden_states = hidden_states.unsqueeze(1) + for layer_num, blk in enumerate(self.blocks): if layer_num in self.fullatt_block_indexes: cu_seqlens_now = cu_seqlens @@ -932,12 +967,13 @@ def _process_image_input( grid_thw = image_input["image_grid_thw"] assert 
grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if image_input["type"] == "image_embeds": image_embeds = image_input["image_embeds"].type(self.visual.dtype) else: pixel_values = image_input["pixel_values"].type(self.visual.dtype) - image_embeds = self.visual(pixel_values, grid_thw=grid_thw) + image_embeds = self.visual(pixel_values, grid_thw=grid_thw_list) # Split concatenated embeddings for each image item. merge_size = self.visual.spatial_merge_size @@ -951,13 +987,15 @@ def _process_video_input( grid_thw = video_input["video_grid_thw"] assert grid_thw.ndim == 2 + grid_thw_list = grid_thw.tolist() if video_input["type"] == "video_embeds": video_embeds = video_input["video_embeds"].type(self.visual.dtype) else: pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) - video_embeds = self.visual(pixel_values_videos, grid_thw=grid_thw) + video_embeds = self.visual(pixel_values_videos, + grid_thw=grid_thw_list) # Split concatenated embeddings for each video item. merge_size = self.visual.spatial_merge_size From 541817670cfa6101b135cb12428bd8f875364432 Mon Sep 17 00:00:00 2001 From: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com> Date: Fri, 16 May 2025 03:02:42 -0500 Subject: [PATCH 50/58] [Misc] Add Ray Prometheus logger to V1 (#17925) Signed-off-by: Seiji Eicher --- tests/v1/metrics/test_ray_metrics.py | 57 +++++++++++++ vllm/v1/metrics/loggers.py | 54 ++++++------ vllm/v1/metrics/ray_wrappers.py | 120 +++++++++++++++++++++++++++ vllm/v1/spec_decode/metrics.py | 27 +++--- 4 files changed, 223 insertions(+), 35 deletions(-) create mode 100644 tests/v1/metrics/test_ray_metrics.py create mode 100644 vllm/v1/metrics/ray_wrappers.py diff --git a/tests/v1/metrics/test_ray_metrics.py b/tests/v1/metrics/test_ray_metrics.py new file mode 100644 index 000000000000..02475f7c150b --- /dev/null +++ b/tests/v1/metrics/test_ray_metrics.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 +import pytest +import ray + +from vllm.sampling_params import SamplingParams +from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM +from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger + + +@pytest.fixture(scope="function", autouse=True) +def use_v1_only(monkeypatch): + """ + The change relies on V1 APIs, so set VLLM_USE_V1=1. + """ + monkeypatch.setenv('VLLM_USE_V1', '1') + + +MODELS = [ + "distilbert/distilgpt2", +] + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [16]) +def test_engine_log_metrics_ray( + example_prompts, + model: str, + dtype: str, + max_tokens: int, +) -> None: + """ Simple smoke test, verifying this can be used without exceptions. 
+ Need to start a Ray cluster in order to verify outputs.""" + + @ray.remote(num_gpus=1) + class EngineTestActor: + + async def run(self): + engine_args = AsyncEngineArgs( + model=model, + dtype=dtype, + disable_log_stats=False, + ) + + engine = AsyncLLM.from_engine_args( + engine_args, stat_loggers=[RayPrometheusStatLogger]) + + for i, prompt in enumerate(example_prompts): + engine.generate( + request_id=f"request-id-{i}", + prompt=prompt, + sampling_params=SamplingParams(max_tokens=max_tokens), + ) + + # Create the actor and call the async method + actor = EngineTestActor.remote() # type: ignore[attr-defined] + ray.get(actor.run.remote()) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 6ee40850beb1..2b75a3a2ecbd 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -138,6 +138,10 @@ def log_engine_initialized(self): class PrometheusStatLogger(StatLoggerBase): + _gauge_cls = prometheus_client.Gauge + _counter_cls = prometheus_client.Counter + _histogram_cls = prometheus_client.Histogram + _spec_decoding_cls = SpecDecodingProm def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): self._unregister_vllm_metrics() @@ -156,18 +160,18 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): max_model_len = vllm_config.model_config.max_model_len - self.spec_decoding_prom = SpecDecodingProm( + self.spec_decoding_prom = self._spec_decoding_cls( vllm_config.speculative_config, labelnames, labelvalues) # # Scheduler state # - self.gauge_scheduler_running = prometheus_client.Gauge( + self.gauge_scheduler_running = self._gauge_cls( name="vllm:num_requests_running", documentation="Number of requests in model execution batches.", labelnames=labelnames).labels(*labelvalues) - self.gauge_scheduler_waiting = prometheus_client.Gauge( + self.gauge_scheduler_waiting = self._gauge_cls( name="vllm:num_requests_waiting", documentation="Number of requests waiting to be processed.", labelnames=labelnames).labels(*labelvalues) @@ -175,18 +179,18 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): # # GPU cache # - self.gauge_gpu_cache_usage = prometheus_client.Gauge( + self.gauge_gpu_cache_usage = self._gauge_cls( name="vllm:gpu_cache_usage_perc", documentation="GPU KV-cache usage. 
1 means 100 percent usage.", labelnames=labelnames).labels(*labelvalues) - self.counter_gpu_prefix_cache_queries = prometheus_client.Counter( + self.counter_gpu_prefix_cache_queries = self._counter_cls( name="vllm:gpu_prefix_cache_queries", documentation= "GPU prefix cache queries, in terms of number of queried tokens.", labelnames=labelnames).labels(*labelvalues) - self.counter_gpu_prefix_cache_hits = prometheus_client.Counter( + self.counter_gpu_prefix_cache_hits = self._counter_cls( name="vllm:gpu_prefix_cache_hits", documentation= "GPU prefix cache hits, in terms of number of cached tokens.", @@ -195,24 +199,24 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): # # Counters # - self.counter_num_preempted_reqs = prometheus_client.Counter( + self.counter_num_preempted_reqs = self._counter_cls( name="vllm:num_preemptions_total", documentation="Cumulative number of preemption from the engine.", labelnames=labelnames).labels(*labelvalues) - self.counter_prompt_tokens = prometheus_client.Counter( + self.counter_prompt_tokens = self._counter_cls( name="vllm:prompt_tokens_total", documentation="Number of prefill tokens processed.", labelnames=labelnames).labels(*labelvalues) - self.counter_generation_tokens = prometheus_client.Counter( + self.counter_generation_tokens = self._counter_cls( name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames).labels(*labelvalues) self.counter_request_success: dict[FinishReason, prometheus_client.Counter] = {} - counter_request_success_base = prometheus_client.Counter( + counter_request_success_base = self._counter_cls( name="vllm:request_success_total", documentation="Count of successfully processed requests.", labelnames=labelnames + ["finished_reason"]) @@ -225,21 +229,21 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): # Histograms of counts # self.histogram_num_prompt_tokens_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_prompt_tokens", documentation="Number of prefill tokens processed.", buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames).labels(*labelvalues) self.histogram_num_generation_tokens_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_generation_tokens", documentation="Number of generation tokens processed.", buckets=build_1_2_5_buckets(max_model_len), labelnames=labelnames).labels(*labelvalues) self.histogram_iteration_tokens = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:iteration_tokens_total", documentation="Histogram of number of tokens per engine_step.", buckets=[ @@ -249,7 +253,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): labelnames=labelnames).labels(*labelvalues) self.histogram_max_num_generation_tokens_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_max_num_generation_tokens", documentation= "Histogram of maximum number of requested generation tokens.", @@ -257,14 +261,14 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): labelnames=labelnames).labels(*labelvalues) self.histogram_n_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", buckets=[1, 2, 5, 10, 20], labelnames=labelnames).labels(*labelvalues) self.histogram_max_tokens_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_params_max_tokens", 
documentation="Histogram of the max_tokens request parameter.", buckets=build_1_2_5_buckets(max_model_len), @@ -274,7 +278,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): # Histogram of timing intervals # self.histogram_time_to_first_token = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", buckets=[ @@ -285,7 +289,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): labelnames=labelnames).labels(*labelvalues) self.histogram_time_per_output_token = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:time_per_output_token_seconds", documentation="Histogram of time per output token in seconds.", buckets=[ @@ -299,34 +303,34 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 ] self.histogram_e2e_time_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of e2e request latency in seconds.", buckets=request_latency_buckets, labelnames=labelnames).labels(*labelvalues) self.histogram_queue_time_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_queue_time_seconds", documentation= "Histogram of time spent in WAITING phase for request.", buckets=request_latency_buckets, labelnames=labelnames).labels(*labelvalues) self.histogram_inference_time_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_inference_time_seconds", documentation= "Histogram of time spent in RUNNING phase for request.", buckets=request_latency_buckets, labelnames=labelnames).labels(*labelvalues) self.histogram_prefill_time_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_prefill_time_seconds", documentation= "Histogram of time spent in PREFILL phase for request.", buckets=request_latency_buckets, labelnames=labelnames).labels(*labelvalues) self.histogram_decode_time_request = \ - prometheus_client.Histogram( + self._histogram_cls( name="vllm:request_decode_time_seconds", documentation= "Histogram of time spent in DECODE phase for request.", @@ -343,7 +347,7 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): self.labelname_running_lora_adapters = "running_lora_adapters" self.max_lora = vllm_config.lora_config.max_loras self.gauge_lora_info = \ - prometheus_client.Gauge( + self._gauge_cls( name="vllm:lora_requests_info", documentation="Running stats on lora requests.", labelnames=[ @@ -365,7 +369,7 @@ def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo): # Info type metrics are syntactic sugar for a gauge permanently set to 1 # Since prometheus multiprocessing mode does not support Info, emulate # info here with a gauge. 
- info_gauge = prometheus_client.Gauge( + info_gauge = self._gauge_cls( name=name, documentation=documentation, labelnames=metrics_info.keys()).labels(**metrics_info) diff --git a/vllm/v1/metrics/ray_wrappers.py b/vllm/v1/metrics/ray_wrappers.py new file mode 100644 index 000000000000..a51c3ed7f572 --- /dev/null +++ b/vllm/v1/metrics/ray_wrappers.py @@ -0,0 +1,120 @@ +# SPDX-License-Identifier: Apache-2.0 +import time +from typing import Optional, Union + +from vllm.config import VllmConfig +from vllm.v1.metrics.loggers import PrometheusStatLogger +from vllm.v1.spec_decode.metrics import SpecDecodingProm + +try: + from ray.util import metrics as ray_metrics + from ray.util.metrics import Metric +except ImportError: + ray_metrics = None + + +class RayPrometheusMetric: + + def __init__(self): + if ray_metrics is None: + raise ImportError( + "RayPrometheusMetric requires Ray to be installed.") + + self.metric: Metric = None + + def labels(self, *labels, **labelskwargs): + if labelskwargs: + for k, v in labelskwargs.items(): + if not isinstance(v, str): + labelskwargs[k] = str(v) + + self.metric.set_default_tags(labelskwargs) + + return self + + +class RayGaugeWrapper(RayPrometheusMetric): + """Wraps around ray.util.metrics.Gauge to provide same API as + prometheus_client.Gauge""" + + def __init__(self, + name: str, + documentation: Optional[str] = "", + labelnames: Optional[list[str]] = None): + labelnames_tuple = tuple(labelnames) if labelnames else None + self.metric = ray_metrics.Gauge(name=name, + description=documentation, + tag_keys=labelnames_tuple) + + def set(self, value: Union[int, float]): + return self.metric.set(value) + + def set_to_current_time(self): + # ray metrics doesn't have set_to_current time, https://docs.ray.io/en/latest/_modules/ray/util/metrics.html + return self.metric.set(time.time()) + + +class RayCounterWrapper(RayPrometheusMetric): + """Wraps around ray.util.metrics.Counter to provide same API as + prometheus_client.Counter""" + + def __init__(self, + name: str, + documentation: Optional[str] = "", + labelnames: Optional[list[str]] = None): + labelnames_tuple = tuple(labelnames) if labelnames else None + self.metric = ray_metrics.Counter(name=name, + description=documentation, + tag_keys=labelnames_tuple) + + def inc(self, value: Union[int, float] = 1.0): + if value == 0: + return + return self.metric.inc(value) + + +class RayHistogramWrapper(RayPrometheusMetric): + """Wraps around ray.util.metrics.Histogram to provide same API as + prometheus_client.Histogram""" + + def __init__(self, + name: str, + documentation: Optional[str] = "", + labelnames: Optional[list[str]] = None, + buckets: Optional[list[float]] = None): + labelnames_tuple = tuple(labelnames) if labelnames else None + boundaries = buckets if buckets else [] + self.metric = ray_metrics.Histogram(name=name, + description=documentation, + tag_keys=labelnames_tuple, + boundaries=boundaries) + + def observe(self, value: Union[int, float]): + return self.metric.observe(value) + + +class RaySpecDecodingProm(SpecDecodingProm): + """ + RaySpecDecodingProm is used by RayMetrics to log to Ray metrics. + Provides the same metrics as SpecDecodingProm but uses Ray's + util.metrics library. 
+ """ + + _counter_cls = RayCounterWrapper + + +class RayPrometheusStatLogger(PrometheusStatLogger): + """RayPrometheusStatLogger uses Ray metrics instead.""" + + _gauge_cls = RayGaugeWrapper + _counter_cls = RayCounterWrapper + _histogram_cls = RayHistogramWrapper + _spec_decoding_cls = RaySpecDecodingProm + + def __init__(self, vllm_config: VllmConfig, engine_index: int = 0): + super().__init__(vllm_config, engine_index) + + @staticmethod + def _unregister_vllm_metrics(): + # No-op on purpose + pass diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py index f71a59908ef3..899aa9200e85 100644 --- a/vllm/v1/spec_decode/metrics.py +++ b/vllm/v1/spec_decode/metrics.py @@ -120,24 +120,30 @@ class SpecDecodingProm: vllm:spec_decode_num_drafts[$interval] """ - def __init__(self, speculative_config: Optional[SpeculativeConfig], - labelnames: list[str], labelvalues: list[str]): + _counter_cls = prometheus_client.Counter + + def __init__( + self, + speculative_config: Optional[SpeculativeConfig], + labelnames: list[str], + labelvalues: list[str], + ): self.spec_decoding_enabled = speculative_config is not None if not self.spec_decoding_enabled: return self.counter_spec_decode_num_drafts = \ - prometheus_client.Counter( + self._counter_cls( name="vllm:spec_decode_num_drafts_total", documentation="Number of spec decoding drafts.", labelnames=labelnames).labels(*labelvalues) self.counter_spec_decode_num_draft_tokens = \ - prometheus_client.Counter( + self._counter_cls( name="vllm:spec_decode_num_draft_tokens_total", documentation="Number of draft tokens.", - labelnames=labelnames).labels(*labelvalues) + labelnames=labelnames,).labels(*labelvalues) self.counter_spec_decode_num_accepted_tokens = \ - prometheus_client.Counter( + self._counter_cls( name="vllm:spec_decode_num_accepted_tokens_total", documentation="Number of accepted tokens.", labelnames=labelnames).labels(*labelvalues) @@ -146,12 +152,13 @@ def __init__(self, speculative_config: Optional[SpeculativeConfig], num_spec_tokens = (speculative_config.num_speculative_tokens if self.spec_decoding_enabled else 0) pos_labelnames = labelnames + ["position"] - base_counter = prometheus_client.Counter( + base_counter = self._counter_cls( name="vllm:spec_decode_num_accepted_tokens_per_pos", documentation="Accepted tokens per draft position.", - labelnames=pos_labelnames) - self.counter_spec_decode_num_accepted_tokens_per_pos: \ - list[prometheus_client.Counter] = [] + labelnames=pos_labelnames, + ) + self.counter_spec_decode_num_accepted_tokens_per_pos: list[ + prometheus_client.Counter] = [] for pos in range(num_spec_tokens): pos_labelvalues = labelvalues + [str(pos)] self.counter_spec_decode_num_accepted_tokens_per_pos.append( From 390ec88905fa2c7dfbcfc5e772891e48f228bf43 Mon Sep 17 00:00:00 2001 From: Isotr0py Date: Fri, 16 May 2025 17:18:08 +0800 Subject: [PATCH 51/58] [Misc] Consolidate Audio tests into multimodal common generation tests (#18214) Signed-off-by: Isotr0py <2037008807@qq.com> --- .../multimodal/generation/test_common.py | 64 ++++++++- .../multimodal/generation/test_ultravox.py | 112 +-------------- .../generation/vlm_utils/builders.py | 133 +++++++++++++----- .../generation/vlm_utils/case_filtering.py | 8 +- .../multimodal/generation/vlm_utils/core.py | 29 ++-- .../generation/vlm_utils/custom_inputs.py | 76 +++++----- .../generation/vlm_utils/model_utils.py | 12 ++ .../generation/vlm_utils/runners.py | 37 +++-- .../multimodal/generation/vlm_utils/types.py | 26 +++- 9 files changed, 282 insertions(+), 215 
deletions(-) diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index dead2edc4fa3..d51a03dfea7e 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -8,14 +8,14 @@ from pathlib import PosixPath import pytest -from transformers import (AutoModelForImageTextToText, +from transformers import (AutoModel, AutoModelForImageTextToText, AutoModelForTextToWaveform, AutoModelForVision2Seq) from vllm.platforms import current_platform from vllm.utils import identity -from ....conftest import (IMAGE_ASSETS, HfRunner, ImageTestAssets, - VideoTestAssets, VllmRunner) +from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner, + ImageTestAssets, VideoTestAssets, VllmRunner) from ....utils import (create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks) from ...utils import check_outputs_equal @@ -158,6 +158,17 @@ image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), + "ultravox": VLMTestInfo( + models = ["fixie-ai/ultravox-v0_5-llama-3_2-1b"], + test_type=VLMTestType.AUDIO, + prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 + audio_idx_to_prompt=lambda idx: "<|audio|>", + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModel, + hf_output_post_proc=model_utils.ultravox_trunc_hf_output, + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), #### Extended model tests "aria": VLMTestInfo( models=["rhymes-ai/Aria"], @@ -393,7 +404,6 @@ formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 ), limit_mm_per_prompt={"video": 4}, - runner_mm_key="videos", )], ), "llava_next_video": VLMTestInfo( @@ -706,6 +716,7 @@ def _mark_splits( # - multi-image # - image embeddings # - video +# - audio # - custom inputs @pytest.mark.parametrize( "model_type,test_case", @@ -803,6 +814,28 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs, ) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.AUDIO, + create_new_process_for_each_test=False, + )) +def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") + model_test_info = VLM_TEST_SETTINGS[model_type] + runners.run_audio_test( + model_test_info=model_test_info, + test_case=test_case, + hf_runner=hf_runner, + vllm_runner=vllm_runner, + audio_assets=audio_assets, + ) + + @pytest.mark.parametrize( "model_type,test_case", get_parametrized_options( @@ -930,6 +963,29 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, ) +@pytest.mark.parametrize( + "model_type,test_case", + get_parametrized_options( + VLM_TEST_SETTINGS, + test_type=VLMTestType.AUDIO, + create_new_process_for_each_test=True, + )) +def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs, + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + audio_assets: AudioTestAssets, monkeypatch): + if model_type in REQUIRES_V0_MODELS: + monkeypatch.setenv("VLLM_USE_V1", "0") + model_test_info = VLM_TEST_SETTINGS[model_type] + 
runners.run_audio_test( + model_test_info=model_test_info, + test_case=test_case, + hf_runner=hf_runner, + vllm_runner=vllm_runner, + audio_assets=audio_assets, + ) + + @pytest.mark.parametrize( "model_type,test_case", get_parametrized_options( diff --git a/tests/models/multimodal/generation/test_ultravox.py b/tests/models/multimodal/generation/test_ultravox.py index 322d886a593d..2c8a06688ca0 100644 --- a/tests/models/multimodal/generation/test_ultravox.py +++ b/tests/models/multimodal/generation/test_ultravox.py @@ -1,20 +1,16 @@ # SPDX-License-Identifier: Apache-2.0 import json -from typing import Any, Optional +from typing import Any import numpy as np import pytest import pytest_asyncio -from transformers import AutoModel, AutoTokenizer +from transformers import AutoTokenizer -from vllm.multimodal.audio import resample_audio_librosa -from vllm.sequence import SampleLogprobs - -from ....conftest import AUDIO_ASSETS, AudioTestAssets, HfRunner, VllmRunner +from ....conftest import AUDIO_ASSETS, AudioTestAssets, VllmRunner from ....utils import RemoteOpenAIServer from ...registry import HF_EXAMPLE_MODELS -from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b" @@ -88,79 +84,6 @@ def _get_prompt(audio_count, question, placeholder): add_generation_prompt=True) -def vllm_to_hf_output(vllm_output: tuple[list[int], str, - Optional[SampleLogprobs]], - model: str): - """Sanitize vllm output to be comparable with hf output.""" - output_ids, output_str, out_logprobs = vllm_output - - tokenizer = AutoTokenizer.from_pretrained(model) - eos_token_id = tokenizer.eos_token_id - - hf_output_ids = output_ids[:] - hf_output_str = output_str - if hf_output_ids[-1] == eos_token_id: - hf_output_str = hf_output_str + tokenizer.decode(eos_token_id) - - return hf_output_ids, hf_output_str, out_logprobs - - -def run_test( - hf_runner: type[HfRunner], - vllm_runner: type[VllmRunner], - prompts_and_audios: list[tuple[str, str, AudioTuple]], - model: str, - *, - dtype: str, - max_tokens: int, - num_logprobs: int, - **kwargs, -): - """Inference result should be the same between hf and vllm.""" - model_info = HF_EXAMPLE_MODELS.find_hf_info(model) - model_info.check_available_online(on_fail="skip") - model_info.check_transformers_version(on_fail="skip") - - # NOTE: take care of the order. run vLLM first, and then run HF. - # vLLM needs a fresh new process without cuda initialization. - # if we run HF first, the cuda initialization will be done and it - # will hurt multiprocessing backend with fork method (the default method). 
- - with vllm_runner(model, dtype=dtype, enforce_eager=True, - **kwargs) as vllm_model: - vllm_outputs_per_audio = [ - vllm_model.generate_greedy_logprobs([vllm_prompt], - max_tokens, - num_logprobs=num_logprobs, - audios=[audio]) - for vllm_prompt, _, audio in prompts_and_audios - ] - - with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: - hf_outputs_per_audio = [ - hf_model.generate_greedy_logprobs_limit( - [hf_prompt], - max_tokens, - num_logprobs=num_logprobs, - audios=[(resample_audio_librosa(audio[0], - orig_sr=audio[1], - target_sr=16000), 16000)]) - for _, hf_prompt, audio in prompts_and_audios - ] - - for hf_outputs, vllm_outputs in zip(hf_outputs_per_audio, - vllm_outputs_per_audio): - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=[ - vllm_to_hf_output(vllm_output, model) - for vllm_output in vllm_outputs - ], - name_0="hf", - name_1="vllm", - ) - - def run_multi_audio_test( vllm_runner: type[VllmRunner], prompts_and_audios: list[tuple[str, list[AudioTuple]]], @@ -194,35 +117,6 @@ def run_multi_audio_test( assert all(tokens for tokens, *_ in vllm_outputs) -@pytest.mark.core_model -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [128]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("vllm_kwargs", [ - pytest.param({}, marks=pytest.mark.cpu_model), - pytest.param(CHUNKED_PREFILL_KWARGS), -]) -def test_models(hf_runner, vllm_runner, audio_assets: AudioTestAssets, - dtype: str, max_tokens: int, num_logprobs: int, - vllm_kwargs: dict) -> None: - audio_inputs = [( - _get_prompt(1, audio, VLLM_PLACEHOLDER), - _get_prompt(1, audio, HF_PLACEHOLDER), - audio.audio_and_sample_rate, - ) for audio in audio_assets] - - run_test( - hf_runner, - vllm_runner, - audio_inputs, - MODEL_NAME, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - **vllm_kwargs, - ) - - @pytest.mark.core_model @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index e3ba955a96a6..32117c8d8dca 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -7,18 +7,21 @@ import torch +from vllm.multimodal.audio import AudioResampler from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import (rescale_video_size, resize_video, sample_frames_from_video) -from .....conftest import ImageTestAssets, VideoTestAssets -from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, +from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets +from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS, + TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER, TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT, - ImageSizeWrapper, SizeType, VLMTestInfo) + ImageSizeWrapper, PromptWithMultiModalInput, SizeType, + VLMTestInfo) -def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], - str], +def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int], + str], test_placeholder: str) -> str: """Given a prompt, replaces each test placeholder with the model-specific tag. 
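
replace_test_placeholder is now driven by a per-modality mm_idx_to_prompt callable, so one helper serves image, video, and audio placeholders alike. A hypothetical usage sketch; the value of TEST_AUDIO_PLACEHOLDER below is an assumption, while the "<|audio|>" tag matches the ultravox entry added in test_common.py:

# Hypothetical illustration of the generalized placeholder replacement.
TEST_AUDIO_PLACEHOLDER = "<audio_placeholder>"  # assumed value for the sketch

def audio_idx_to_prompt(idx: int) -> str:
    return "<|audio|>"

base_prompt = f"{TEST_AUDIO_PLACEHOLDER}What is recited in the audio?"

# Mirrors replace_test_placeholder(prompt, mm_idx_to_prompt, placeholder)
segments = base_prompt.split(TEST_AUDIO_PLACEHOLDER)
result = segments[0]
for idx, seg in enumerate(segments[1:], start=1):
    result += audio_idx_to_prompt(idx) + seg

print(result)  # "<|audio|>What is recited in the audio?"
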
@@ -26,7 +29,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], prompt_segments = prompt.split(test_placeholder) img_prompt = prompt_segments[0] for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1): - img_prompt += img_idx_to_prompt(placeholder_idx) + img_prompt += mm_idx_to_prompt(placeholder_idx) img_prompt += next_seg return img_prompt @@ -34,6 +37,7 @@ def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], def get_model_prompts(base_prompts: Iterable[str], img_idx_to_prompt: Optional[Callable[[int], str]], video_idx_to_prompt: Optional[Callable[[int], str]], + audio_idx_to_prompt: Optional[Callable[[int], str]], prompt_formatter: Callable[[str], str]) -> list[str]: """Given a model-agnostic base prompt and test configuration for a model(s) to be tested, update the media placeholders and apply the prompt formatting @@ -60,6 +64,11 @@ def get_model_prompts(base_prompts: Iterable[str], video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER) + if audio_idx_to_prompt: + base_prompt = replace_test_placeholder(base_prompt, + audio_idx_to_prompt, + TEST_AUDIO_PLACEHOLDER) + # Apply the prompt formatter to wrap the base prompt with # the correct media placeholders to get the model test prompt model_prompt = prompt_formatter(base_prompt) @@ -68,10 +77,11 @@ def get_model_prompts(base_prompts: Iterable[str], def build_single_image_inputs_from_test_info( - test_info: VLMTestInfo, - image_assets: ImageTestAssets, - size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None): + test_info: VLMTestInfo, + image_assets: ImageTestAssets, + size_wrapper: ImageSizeWrapper, + tmp_path: Optional[PosixPath] = None, +) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError( "Prompt formatter must be set to build single image inputs") @@ -79,6 +89,7 @@ def build_single_image_inputs_from_test_info( model_prompts = get_model_prompts(test_info.single_image_prompts, test_info.img_idx_to_prompt, test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, test_info.prompt_formatter) # For models that require a local path / URL encoded in the image; export @@ -97,28 +108,32 @@ def build_single_image_inputs_from_test_info( return build_single_image_inputs(images, model_prompts, size_wrapper) -def build_single_image_inputs(images, model_prompts, - size_wrapper: ImageSizeWrapper): +def build_single_image_inputs( + images, model_prompts, + size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]: # For every image / prompt pair, get a pair containing two lists of # length size_factors, where the first contains duplicates of the model # prompt [str], and the second contains copies of the image after being # scaled by one of the size factors. # # NOTE: rescaling preserves the image aspect ratio. 
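
A hypothetical walk-through of the expansion described in the comment above; the size factors, prompt, and image dimensions are illustrative only:

# Illustrative only: one prompt/image pair expands into parallel lists,
# one entry per size factor, with the aspect ratio preserved.
from PIL import Image

size_factors = (0.25, 0.5, 1.0)                       # assumed factors
prompt = "USER: <image>\nWhat is shown?\nASSISTANT:"  # assumed prompt
image = Image.new("RGB", (640, 480))

prompts = [prompt for _ in size_factors]
images = [image.resize((int(640 * f), int(480 * f))) for f in size_factors]

assert len(prompts) == len(images) == len(size_factors)
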
- return [( - [prompt for _ in size_wrapper.data], - [ - apply_image_size_scaling(image, size, size_wrapper.type) - for size in size_wrapper.data - ], - ) for image, prompt in zip(images, model_prompts)] + return [ + PromptWithMultiModalInput( + prompts=[prompt for _ in size_wrapper.data], + image_data=[ + apply_image_size_scaling(image, size, size_wrapper.type) + for size in size_wrapper.data + ], + ) for image, prompt in zip(images, model_prompts) + ] def build_multi_image_inputs_from_test_info( - test_info: VLMTestInfo, - image_assets: ImageTestAssets, - size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None): + test_info: VLMTestInfo, + image_assets: ImageTestAssets, + size_wrapper: ImageSizeWrapper, + tmp_path: Optional[PosixPath] = None, +) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError( "Prompt formatter must be set to build multi image inputs") @@ -126,6 +141,7 @@ def build_multi_image_inputs_from_test_info( model_prompts = get_model_prompts([test_info.multi_image_prompt], test_info.img_idx_to_prompt, test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, test_info.prompt_formatter) if test_info.prompt_path_encoder is not None: @@ -146,15 +162,18 @@ def build_multi_image_inputs_from_test_info( ) -def build_multi_image_inputs(image_lists, model_prompts, - size_wrapper: ImageSizeWrapper): - return [( - [prompt for _ in size_wrapper.data], - [[ - apply_image_size_scaling(image, size, size_wrapper.type) - for image in images - ] for size in size_wrapper.data], - ) for images, prompt in zip(image_lists, model_prompts)] +def build_multi_image_inputs( + image_lists, model_prompts, + size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]: + return [ + PromptWithMultiModalInput( + prompts=[prompt for _ in size_wrapper.data], + image_data=[[ + apply_image_size_scaling(image, size, size_wrapper.type) + for image in images + ] for size in size_wrapper.data], + ) for images, prompt in zip(image_lists, model_prompts) + ] def build_embedding_inputs_from_test_info( @@ -177,6 +196,7 @@ def build_embedding_inputs_from_test_info( SINGLE_IMAGE_BASE_PROMPTS, test_info.img_idx_to_prompt, test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, test_info.prompt_formatter, ) @@ -195,13 +215,14 @@ def build_video_inputs_from_test_info( video_assets: VideoTestAssets, size_wrapper: ImageSizeWrapper, num_frames: int, -): +) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build video inputs") model_prompts = get_model_prompts( [VIDEO_BASE_PROMPT], test_info.img_idx_to_prompt, test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, test_info.prompt_formatter, ) @@ -213,10 +234,14 @@ def build_video_inputs_from_test_info( video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size) - return [( - [prompt for _ in size_wrapper.data], - [video_scaler(video, size) for size in size_wrapper.data], - ) for video, prompt in zip(sampled_vids, model_prompts)] + return [ + PromptWithMultiModalInput( + prompts=[prompt for _ in size_wrapper.data], + video_data=[ + video_scaler(video, size) for size in size_wrapper.data + ], + ) for video, prompt in zip(sampled_vids, model_prompts) + ] def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], @@ -236,3 +261,37 @@ def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], # We have a list of fixed sizes return image.resize(size) raise 
ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR") + + +def build_audio_inputs_from_test_info( + test_info: VLMTestInfo, + audio_assets: AudioTestAssets, +) -> list[PromptWithMultiModalInput]: + if test_info.prompt_formatter is None: + raise ValueError("Prompt formatter must be set to build audio inputs") + model_prompts = get_model_prompts( + SINGLE_AUDIO_BASE_PROMPT, + test_info.img_idx_to_prompt, + test_info.video_idx_to_prompt, + test_info.audio_idx_to_prompt, + test_info.prompt_formatter, + ) + resampler = AudioResampler( + target_sr=16000, + method="librosa", + ) + audios = [asset.audio_and_sample_rate for asset in audio_assets] + resampled_audios = [( + resampler.resample( + audio, + orig_sr=sr, + ), + int(resampler.target_sr), + ) for audio, sr in audios] + + return [ + PromptWithMultiModalInput( + prompts=model_prompts, + audio_data=resampled_audios, + ) + ] diff --git a/tests/models/multimodal/generation/vlm_utils/case_filtering.py b/tests/models/multimodal/generation/vlm_utils/case_filtering.py index 8e825676b8f4..a5077a090b52 100644 --- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py +++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py @@ -83,7 +83,7 @@ def get_model_type_cases(model_type: str, test_info: VLMTestInfo): test_info.num_video_frames) # No sizes passed for custom inputs, since inputs are directly provided - if test_type != VLMTestType.CUSTOM_INPUTS: + if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO): wrapped_sizes = get_wrapped_test_sizes(test_info, test_type) if wrapped_sizes is None: raise ValueError( @@ -91,7 +91,7 @@ def get_model_type_cases(model_type: str, test_info: VLMTestInfo): iter_kwargs["size_wrapper"] = wrapped_sizes #Otherwise expand the custom test options instead - else: + elif test_type == VLMTestType.CUSTOM_INPUTS: if test_info.custom_test_opts is None: raise ValueError("Test has type CUSTOM_INPUTS, but none given") iter_kwargs["custom_test_opts"] = test_info.custom_test_opts @@ -136,8 +136,8 @@ def get_wrapped_test_sizes( ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor) for factor in EMBEDDING_SIZE_FACTORS ]) - # Custom inputs have preprocessed inputs - elif test_type == VLMTestType.CUSTOM_INPUTS: + # Audio and Custom inputs have preprocessed inputs + elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS): return tuple() size_factors = test_info.image_size_factors \ diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index c3d20f56855f..ccd2799abd90 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Core test implementation to be shared across modalities.""" -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional import torch -from PIL.Image import Image from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config import TaskOption @@ -11,14 +10,14 @@ from .....conftest import HfRunner, VllmRunner from ....registry import HF_EXAMPLE_MODELS -from .types import RunnerOutput +from .types import PromptWithMultiModalInput, RunnerOutput def run_test( *, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: list[tuple[list[str], list[Union[list[Image], Image]]]], + inputs: list[PromptWithMultiModalInput], model: str, dtype: str, max_tokens: int, @@ -38,7 +37,6 @@ def run_test( hf_model_kwargs: 
Optional[dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], task: TaskOption = "auto", - runner_mm_key: str = "images", distributed_executor_backend: Optional[str] = None, tensor_parallel_size: int = 1, vllm_embeddings: Optional[torch.Tensor] = None, @@ -94,10 +92,16 @@ def run_test( if stop_str: vllm_kwargs["stop"] = stop_str - for prompts, media in vllm_inputs: - vllm_kwargs[runner_mm_key] = media + for prompts, image_data, video_data, audio_data in vllm_inputs: + mm_data = dict(images=image_data, + videos=video_data, + audios=audio_data) + vllm_kwargs_with_mm_data = vllm_kwargs | mm_data vllm_output = vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs) + prompts, + max_tokens, + num_logprobs=num_logprobs, + **vllm_kwargs_with_mm_data) vllm_outputs_per_mm.append(vllm_output) hf_model = hf_runner(model, @@ -122,14 +126,17 @@ def run_test( if stop_str: hf_kwargs["stop_strings"] = stop_str - for prompts, media in inputs: - hf_kwargs[runner_mm_key] = media + for prompts, image_data, video_data, audio_data in inputs: + mm_data = dict(images=image_data, + videos=video_data, + audios=audio_data) + hf_kwargs_with_mm_data = hf_kwargs | mm_data hf_output = hf_model.generate_greedy_logprobs_limit( prompts, max_tokens, num_logprobs=num_logprobs, tokenizer=tokenizer, - **hf_kwargs) + **hf_kwargs_with_mm_data) hf_outputs_per_mm.append(hf_output) # Apply output processing / sanitation to the vLLM and HF runner results diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index 235618ae547e..cc1045561138 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -12,7 +12,7 @@ from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .builders import build_multi_image_inputs, build_single_image_inputs -from .types import ImageSizeWrapper, SizeType +from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): @@ -32,24 +32,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): "\nWhat is the season?", ] formatted_prompts = [formatter(prompt) for prompt in img_prompts] - - return [( - formatted_prompts, + aspect_ratio_images = [ + [stop_sign, cherry_blossom], + # Images with different sizes and aspect-ratios + [ + rescale_image_size(stop_sign, 0.1), + stop_sign, + ], [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] + stop_sign, + rescale_image_size(stop_sign, 0.25), + cherry_blossom.resize((183, 488)), + cherry_blossom.resize((488, 183)) + ], + cherry_blossom, + ] + + return [ + PromptWithMultiModalInput( + prompts=formatted_prompts, + image_data=aspect_ratio_images, + ) + ] def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], @@ -68,24 +72,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], "
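
The unpacking in core.run_test above assumes PromptWithMultiModalInput is a four-field record: prompts plus optional image, video, and audio payloads. A minimal sketch of that assumed shape and of how it merges into the runner kwargs; the field names are taken from the loop above, but the actual definition lives in vlm_utils/types.py and may differ in detail:

# Sketch of the assumed PromptWithMultiModalInput shape and how run_test
# consumes it; the real definition lives in vlm_utils/types.py.
from typing import NamedTuple, Optional


class PromptWithMultiModalInput(NamedTuple):
    prompts: list[str]
    image_data: Optional[list] = None
    video_data: Optional[list] = None
    audio_data: Optional[list] = None


inputs = [
    PromptWithMultiModalInput(
        prompts=["<|audio|>What is recited in the audio?"],
        audio_data=[("<resampled waveform>", 16000)],
    )
]

vllm_kwargs: dict = {}
for prompts, image_data, video_data, audio_data in inputs:
    # Every modality is passed explicitly, so the old per-test
    # runner_mm_key ("images"/"videos"/"audios") selector is no longer needed.
    mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
    vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
    print(vllm_kwargs_with_mm_data)
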