diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py index 91a6f09fa1a..a4dd1ce06dc 100644 --- a/vllm_ascend/ops/rotary_embedding.py +++ b/vllm_ascend/ops/rotary_embedding.py @@ -24,7 +24,6 @@ from vllm.model_executor.layers.rotary_embedding import ( DeepseekScalingRotaryEmbedding, MRotaryEmbedding, RotaryEmbedding, YaRNScalingRotaryEmbedding) -from vllm.platforms import CpuArchEnum from vllm_ascend.platform import NPUPlatform from vllm_ascend.utils import (AscendDeviceType, enable_custom_op, @@ -408,10 +407,7 @@ def forward_oot( query: torch.Tensor, key: torch.Tensor, ): - # TODO: This judgment will be removed once the mrope precision issue is fixed - if self.mrope_section != [ - 16, 24, 24 - ] or NPUPlatform.get_cpu_architecture() == CpuArchEnum.X86: + if self.mrope_section != [16, 24, 24]: return super().forward_oot(positions, query, key) import torch_npu @@ -426,7 +422,7 @@ def forward_oot( self.cos_sin_cache = self.cos_sin_cache.to( # type: ignore query.dtype) # type: ignore - query, key = torch_npu.npu_mrope(positions, + query, key = torch_npu.npu_mrope(positions.contiguous(), query.contiguous(), key.contiguous(), self.cos_sin_cache.contiguous(),