diff --git a/.github/workflows/vllm_ascend_test_pr_light.yaml b/.github/workflows/vllm_ascend_test_pr_light.yaml index e2ba3566b75..4bb5006df44 100644 --- a/.github/workflows/vllm_ascend_test_pr_light.yaml +++ b/.github/workflows/vllm_ascend_test_pr_light.yaml @@ -119,9 +119,8 @@ jobs: TORCH_DEVICE_BACKEND_AUTOLOAD: 0 run: | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib + # Remove these ignores after multimodal refactor is done pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut \ - --ignore tests/ut/torchair/models/test_torchair_deepseek_mtp.py \ - --ignore tests/ut/torchair/models/test_torchair_deepseek_v2.py \ --ignore tests/ut/models/test_qwen2_vl.py \ --ignore tests/ut/models/test_qwen2_5_vl.py \ --ignore tests/ut/models/test_qwen2_5_vl_without_padding.py diff --git a/tests/ut/attention/test_mla_v1.py b/tests/ut/attention/test_mla_v1.py index 57ac54c1bd3..2083ef840d4 100644 --- a/tests/ut/attention/test_mla_v1.py +++ b/tests/ut/attention/test_mla_v1.py @@ -454,12 +454,20 @@ def setUp(self): "vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size" ) @patch("vllm_ascend.attention.mla_v1.get_ascend_config") - def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config, + @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros) + @patch("torch.Tensor.npu", new=lambda self: self) + @patch("torch.npu.is_available") + def test_build_prefix_no_cache_metadata(self, mock_npu_available, + mock_zeros, mock_get_ascend_config, mock_dcp_world_size): - if not torch.npu.is_available(): - self.skipTest("NPU not available, skipping NPU-dependent tests") + mock_npu_available.return_value = False mock_dcp_world_size.return_value = 1 + def zeros_override(*args, **kwargs): + kwargs.pop('pin_memory', None) + return mock_zeros._mock_wraps(*args, **kwargs) + + mock_zeros.side_effect = zeros_override common_attn_metadata = AscendCommonAttentionMetadata( query_start_loc=torch.tensor([0, 3, 7]), query_start_loc_cpu=torch.tensor([0, 3, 7]), @@ -506,12 +514,21 @@ def test_build_prefix_no_cache_metadata(self, mock_get_ascend_config, "vllm_ascend.attention.mla_v1.get_decode_context_model_parallel_world_size" ) @patch("vllm_ascend.attention.mla_v1.get_ascend_config") - def test_build_chunked_prefix_metadata(self, mock_get_ascend_config, + @patch("vllm_ascend.attention.mla_v1.torch.zeros", wraps=torch.zeros) + @patch("torch.Tensor.npu", new=lambda self: self) + @patch("torch.npu.is_available") + def test_build_chunked_prefix_metadata(self, mock_npu_available, + mock_zeros, mock_get_ascend_config, mock_dcp_world_size): - if not torch.npu.is_available(): - self.skipTest("NPU not available, skipping NPU-dependent tests") + mock_npu_available.return_value = False mock_dcp_world_size.return_value = 1 + def zeros_override(*args, **kwargs): + kwargs.pop('pin_memory', None) + return mock_zeros._mock_wraps(*args, **kwargs) + + mock_zeros.side_effect = zeros_override + common_attn_metadata = AscendCommonAttentionMetadata( query_start_loc=torch.tensor([0, 2, 5, 9]), query_start_loc_cpu=torch.tensor([0, 2, 5, 9]),