-
Notifications
You must be signed in to change notification settings - Fork 17
Description
Qwen/Qwen3-VL-30B-A3B-Instruct is claimed to work on XPU,
but it does not work when run with the following command. I tried both with and without `--quantization fp8`.
What is the right command?
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
VLLM_WORKER_MULTIPROC_METHOD=spawn \
python3 -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen3-VL-30B-A3B-Instruct \
--served-model-name Qwen3-VL-30B-A3B-Instruct \
--dtype=float16 \
--enforce-eager \
--port 8000 \
--host 0.0.0.0 \
--trust-remote-code \
--gpu-memory-util=0.9 \
--no-enable-prefix-caching \
--max-num-batched-tokens=8192 \
--disable-log-requests \
--max-model-len=8192 \
--block-size 64 \
--quantization fp8 \
-tp=1
(EngineCore_DP0 pid=686) ERROR 02-05 02:55:25 [core.py:842] KeyError: 'layers.0.mlp.experts.w2_weight_scale_inv'
(EngineCore_DP0 pid=686) Process EngineCore_DP0:
(EngineCore_DP0 pid=686) Traceback (most recent call last):
(EngineCore_DP0 pid=686) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=686) self.run()
(EngineCore_DP0 pid=686) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=686) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 846, in run_engine_core
(EngineCore_DP0 pid=686) raise e
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 833, in run_engine_core
(EngineCore_DP0 pid=686) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 606, in __init__
(EngineCore_DP0 pid=686) super().__init__(
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 102, in __init__
(EngineCore_DP0 pid=686) self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 101, in __init__
(EngineCore_DP0 pid=686) self._init_executor()
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/uniproc_executor.py", line 48, in _init_executor
(EngineCore_DP0 pid=686) self.driver_worker.load_model()
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 273, in load_model
(EngineCore_DP0 pid=686) self.model_runner.load_model(eep_scale_up=eep_scale_up)
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 3275, in load_model
(EngineCore_DP0 pid=686) self.model = model_loader.load_model(
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model
(EngineCore_DP0 pid=686) self.load_weights(model, model_config)
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/default_loader.py", line 300, in load_weights
(EngineCore_DP0 pid=686) loaded_weights = model.load_weights(
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl.py", line 1663, in load_weights
(EngineCore_DP0 pid=686) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 332, in load_weights
(EngineCore_DP0 pid=686) autoloaded_weights = set(self._load_module("", self.module, weights))
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 286, in _load_module
(EngineCore_DP0 pid=686) yield from self._load_module(
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 259, in _load_module
(EngineCore_DP0 pid=686) loaded_params = module_load_weights(weights)
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py", line 831, in load_weights
(EngineCore_DP0 pid=686) return loader.load_weights(weights)
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 332, in load_weights
(EngineCore_DP0 pid=686) autoloaded_weights = set(self._load_module("", self.module, weights))
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 286, in _load_module
(EngineCore_DP0 pid=686) yield from self._load_module(
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/utils.py", line 259, in _load_module
(EngineCore_DP0 pid=686) loaded_params = module_load_weights(weights)
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl_moe.py", line 326, in load_weights
(EngineCore_DP0 pid=686) success = self.load_fused_expert_weights(
(EngineCore_DP0 pid=686) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
(EngineCore_DP0 pid=686) File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_vl_moe.py", line 140, in load_fused_expert_weights
(EngineCore_DP0 pid=686) param = params_dict[name]
(EngineCore_DP0 pid=686) ~~~~~~~~~~~^^^^^^
(EngineCore_DP0 pid=686) KeyError: 'layers.0.mlp.experts.w2_weight_scale_inv'
Loading safetensors checkpoint shards: 0% Completed | 0/4 [02:33<?, ?it/s]