Heterogeneous Speculative Decoding (CPU + GPU) #5065

Open · wants to merge 63 commits into base: main

Changes from all commits (63 commits)
aaece57
hete spec decode engine
jiqing-feng May 16, 2024
21fb773
compile ops for cuda and cpu
jiqing-feng May 24, 2024
5f02fdd
can run hete spec decode
jiqing-feng May 24, 2024
8febd81
add parameter cpu_draft_worker to run draft model on CPU
jiqing-feng May 27, 2024
d9af7a6
rm useless comments
jiqing-feng May 27, 2024
74fb5d5
merge main
jiqing-feng May 27, 2024
44acebe
fix conflict
jiqing-feng May 27, 2024
b4b8744
add copy comment
jiqing-feng May 27, 2024
cc7998e
rebase
jiqing-feng Jul 2, 2024
8f7ecf3
fix bug
jiqing-feng Jul 2, 2024
794613e
rebase
jiqing-feng Jul 3, 2024
52022a5
fix style
jiqing-feng Jul 3, 2024
fa40a93
rebbase
jiqing-feng Jul 3, 2024
aa4d556
fix style
jiqing-feng Jul 3, 2024
f7491eb
Merge branch 'main' into hete_spec_decode
jiqing-feng Jul 4, 2024
344f5d7
fix format
jiqing-feng Jul 11, 2024
53cf9b6
rebase
jiqing-feng Jul 11, 2024
23a4575
fix format
jiqing-feng Jul 11, 2024
2cab72f
rebase main
jiqing-feng Jul 30, 2024
185836b
rebase main
jiqing-feng Aug 23, 2024
0556d02
fix style
jiqing-feng Aug 23, 2024
2eb3201
fix diff
jiqing-feng Aug 23, 2024
345788e
fix arg
jiqing-feng Aug 23, 2024
49d5bdf
fix match
jiqing-feng Aug 23, 2024
12e6c5d
fix cmake
jiqing-feng Aug 26, 2024
e313329
fix cmake style
jiqing-feng Aug 26, 2024
32c9c9a
add numa in cuda dockerfile
jiqing-feng Aug 26, 2024
e1782d3
use low version gcc
jiqing-feng Aug 26, 2024
ff7efee
rm useless link
jiqing-feng Aug 26, 2024
1c5d8c4
enable TP
jiqing-feng Aug 26, 2024
59de387
rebase
jiqing-feng Aug 26, 2024
6fc9b3b
fix format
jiqing-feng Aug 26, 2024
ded4e78
rm erro cpu cache ops
jiqing-feng Aug 27, 2024
1eca335
fix cpu op import
jiqing-feng Aug 27, 2024
b986b4d
disable cpu TP model
jiqing-feng Aug 27, 2024
1db03a2
rebase main
jiqing-feng Aug 27, 2024
1033dcc
fix cpu worker core binding
jiqing-feng Aug 27, 2024
a0f172c
fix import cpu ops
jiqing-feng Aug 27, 2024
14df487
enable cpu TP
jiqing-feng Aug 30, 2024
7bbd35b
add cpu-draft-worker parameter
jiqing-feng Sep 3, 2024
c895b50
fix style
jiqing-feng Sep 3, 2024
91499e1
Merge branch 'main' into hete_spec_decode
jiqing-feng Sep 3, 2024
d7b742c
fix param name
jiqing-feng Sep 3, 2024
e016db9
fix cpu-draft-args
jiqing-feng Sep 10, 2024
0d58142
Merge branch 'main' into hete_spec_decode
jiqing-feng Sep 10, 2024
679664b
fix cmake list to avoid amd error
jiqing-feng Sep 10, 2024
729483e
fix ops name
jiqing-feng Sep 10, 2024
13e5e2a
fix distributed tests and disable distributed verified if cpu-draft-m…
jiqing-feng Sep 11, 2024
581c529
fix tests
jiqing-feng Sep 11, 2024
753e1d0
rebase
jiqing-feng Sep 19, 2024
c670338
skip build cpu if rocm and fix code style
jiqing-feng Sep 19, 2024
c3e9488
install onednn
jiqing-feng Sep 19, 2024
5d7233f
fix install onednn position
jiqing-feng Sep 19, 2024
8bfc4e6
ondnn install
jiqing-feng Sep 19, 2024
e01732e
fix cpu op
jiqing-feng Sep 19, 2024
07eb1a1
change dockerfile base image to ubuntu22.04
jiqing-feng Sep 19, 2024
27da2ee
fix cmake list
jiqing-feng Sep 19, 2024
da1728a
install libc6
jiqing-feng Sep 19, 2024
a883fce
revert dockerfile to ubuntu 20.04
jiqing-feng Sep 20, 2024
6aba90b
disable avx512 to pass cpu compile
jiqing-feng Sep 20, 2024
83bc114
reuse multi step worker for CPU
jiqing-feng Sep 20, 2024
d057f34
fix SDPA assert
jiqing-feng Sep 20, 2024
77e97e2
fix format
jiqing-feng Sep 20, 2024
21 changes: 13 additions & 8 deletions CMakeLists.txt
@@ -102,16 +102,21 @@ define_gpu_extension_target(

add_dependencies(default _core_C)

if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
NOT VLLM_TARGET_DEVICE STREQUAL "rocm" AND
NOT VLLM_TARGET_DEVICE STREQUAL "cpu")
return()
endif()

#
# Forward the non-CUDA device extensions to external CMake scripts.
# The CUDA device extensions need CPU CMake scripts to support Heterogeneous Speculative Decoding.
#
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
else()
return()
endif()
if (NOT HIP_FOUND AND CUDA_FOUND)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
endif()

if (VLLM_TARGET_DEVICE STREQUAL "cpu")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
return()
endif()

19 changes: 17 additions & 2 deletions Dockerfile
@@ -17,7 +17,7 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& apt-get install -y ccache software-properties-common git curl sudo wget numactl gcc-10 g++-10 libtcmalloc-minimal4 libnuma-dev libc6 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
@@ -41,7 +41,6 @@ COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt


# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
@@ -83,6 +82,22 @@ ARG USE_SCCACHE
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
ENV VLLM_CPU_DISABLE_AVX512="true"

# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git

RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release

# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
if [ "$USE_SCCACHE" = "1" ]; then \
6 changes: 3 additions & 3 deletions cmake/cpu_extension.cmake
@@ -87,7 +87,7 @@ message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS dnnl numa)

#
# _C extension
# _C_cpu extension
#
set(VLLM_EXT_SRC
"csrc/cpu/activation.cpp"
@@ -109,7 +109,7 @@ endif()
#

define_gpu_extension_target(
_C
_C_cpu
DESTINATION vllm
LANGUAGE CXX
SOURCES ${VLLM_EXT_SRC}
@@ -120,4 +120,4 @@ define_gpu_extension_target(
)

message(STATUS "Enabling C extension.")
add_dependencies(default _C)
add_dependencies(default _C_cpu)
1 change: 1 addition & 0 deletions setup.py
@@ -469,6 +469,7 @@ def _read_requirements(filename: str) -> List[str]:

if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
ext_modules.append(CMakeExtension(name="vllm._C_cpu"))

package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
71 changes: 45 additions & 26 deletions vllm/_custom_ops.py
@@ -17,6 +17,11 @@
except ImportError as e:
logger.warning("Failed to import from vllm._C with %r", e)

try:
import vllm._C_cpu
except ImportError as e:
logger.warning("Failed to import from vllm._C_cpu with %r", e)

if current_platform.is_rocm():
import vllm._rocm_C # noqa: F401

@@ -45,27 +50,33 @@ def wrapper(*args, **kwargs):

# activation ops
def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.silu_and_mul(out, x)
ops = torch.ops._C_cpu if x.device.type == "cpu" else torch.ops._C
ops.silu_and_mul(out, x)


def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.gelu_and_mul(out, x)
ops = torch.ops._C_cpu if x.device.type == "cpu" else torch.ops._C
ops.gelu_and_mul(out, x)


def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.gelu_tanh_and_mul(out, x)
ops = torch.ops._C_cpu if x.device.type == "cpu" else torch.ops._C
ops.gelu_tanh_and_mul(out, x)


def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.gelu_fast(out, x)
ops = torch.ops._C_cpu if x.device.type == "cpu" else torch.ops._C
ops.gelu_fast(out, x)


def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.gelu_new(out, x)
ops = torch.ops._C_cpu if x.device.type == "cpu" else torch.ops._C
ops.gelu_new(out, x)


def gelu_quick(out: torch.Tensor, x: torch.Tensor) -> None:
torch.ops._C.gelu_quick(out, x)
ops = torch.ops._C_cpu if x.device.type == "cpu" else torch.ops._C
ops.gelu_quick(out, x)


# page attention ops
@@ -90,12 +101,13 @@ def paged_attention_v1(
blocksparse_block_size: int = 64,
blocksparse_head_sliding_step: int = 0,
) -> None:
torch.ops._C.paged_attention_v1(
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
k_scale, v_scale, tp_rank, blocksparse_local_blocks,
blocksparse_vert_stride, blocksparse_block_size,
blocksparse_head_sliding_step)
ops = torch.ops._C_cpu if query.device.type == "cpu" else torch.ops._C
ops.paged_attention_v1(out, query, key_cache, value_cache, num_kv_heads,
scale, block_tables, seq_lens, block_size,
max_seq_len, alibi_slopes, kv_cache_dtype, k_scale,
v_scale, tp_rank, blocksparse_local_blocks,
blocksparse_vert_stride, blocksparse_block_size,
blocksparse_head_sliding_step)


def paged_attention_v2(
@@ -122,12 +134,14 @@ def paged_attention_v2(
blocksparse_block_size: int = 64,
blocksparse_head_sliding_step: int = 0,
) -> None:
torch.ops._C.paged_attention_v2(
out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
blocksparse_local_blocks, blocksparse_vert_stride,
blocksparse_block_size, blocksparse_head_sliding_step)
ops = torch.ops._C_cpu if query.device.type == "cpu" else torch.ops._C
ops.paged_attention_v2(out, exp_sum, max_logits, tmp_out, query, key_cache,
value_cache, num_kv_heads, scale, block_tables,
seq_lens, block_size, max_seq_len, alibi_slopes,
kv_cache_dtype, k_scale, v_scale, tp_rank,
blocksparse_local_blocks, blocksparse_vert_stride,
blocksparse_block_size,
blocksparse_head_sliding_step)


def paged_attention_rocm(
@@ -163,8 +177,9 @@ def rotary_embedding(
cos_sin_cache: torch.Tensor,
is_neox: bool,
) -> None:
torch.ops._C.rotary_embedding(positions, query, key, head_size,
cos_sin_cache, is_neox)
ops = torch.ops._C_cpu if query.device.type == "cpu" else torch.ops._C
ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache,
is_neox)


def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
@@ -180,12 +195,14 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
# layer norm ops
def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
epsilon: float) -> None:
torch.ops._C.rms_norm(out, input, weight, epsilon)
ops = torch.ops._C_cpu if input.device.type == "cpu" else torch.ops._C
ops.rms_norm(out, input, weight, epsilon)


def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
weight: torch.Tensor, epsilon: float) -> None:
torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
ops = torch.ops._C_cpu if input.device.type == "cpu" else torch.ops._C
ops.fused_add_rms_norm(input, residual, weight, epsilon)


def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
@@ -819,9 +836,9 @@ def reshape_and_cache(
k_scale: float,
v_scale: float,
) -> None:
torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
value_cache, slot_mapping,
kv_cache_dtype, k_scale, v_scale)
ops = torch.ops._C_cpu_cache_ops if key.device.type == "cpu" else torch.ops._C_cache_ops
ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping,
kv_cache_dtype, k_scale, v_scale)


def reshape_and_cache_flash(
@@ -843,7 +860,9 @@ def reshape_and_cache_flash(
def copy_blocks(key_caches: List[torch.Tensor],
value_caches: List[torch.Tensor],
block_mapping: torch.Tensor) -> None:
torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
ops = torch.ops._C_cpu_cache_ops if key_caches[
0].device.type == "cpu" else torch.ops._C_cache_ops
ops.copy_blocks(key_caches, value_caches, block_mapping)


def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
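Each wrapper in this file now repeats the same device check to pick between the CUDA and CPU extension namespaces. A minimal sketch of that dispatch pattern, with the check factored into a helper; `_select_ops` is a hypothetical name (the PR inlines the conditional in every wrapper), and it assumes both extension libraries have been built and imported:

```python
# Sketch of the dispatch pattern used throughout vllm/_custom_ops.py in this
# PR: route to torch.ops._C_cpu when the tensor lives on CPU, otherwise to
# the default CUDA/ROCm extension torch.ops._C.
import torch


def _select_ops(t: torch.Tensor):
    """Return the custom-op namespace that matches the tensor's device."""
    return torch.ops._C_cpu if t.device.type == "cpu" else torch.ops._C


def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
    _select_ops(x).silu_and_mul(out, x)
```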
5 changes: 4 additions & 1 deletion vllm/attention/layer.py
@@ -76,9 +76,12 @@ def __init__(
# During model initialization, the default dtype is set as the model
# weight and activation dtype.
dtype = torch.get_default_dtype()
device = None
if hasattr(cache_config, "cpu_kvcache_space_bytes"):
device = "cpu"
attn_backend = get_attn_backend(num_heads, head_size, num_kv_heads,
sliding_window, dtype, kv_cache_dtype,
block_size, blocksparse_params
block_size, device, blocksparse_params
is not None)
impl_cls = attn_backend.get_impl_cls()
self.impl = impl_cls(num_heads, head_size, scale, num_kv_heads,
8 changes: 5 additions & 3 deletions vllm/attention/selector.py
@@ -95,6 +95,7 @@ def get_attn_backend(
dtype: torch.dtype,
kv_cache_dtype: Optional[str],
block_size: int,
device=None,
is_blocksparse: bool = False,
) -> Type[AttentionBackend]:
"""Selects which attention backend to use and lazily imports it."""
@@ -107,7 +108,7 @@

backend = which_attn_to_use(num_heads, head_size, num_kv_heads,
sliding_window, dtype, kv_cache_dtype,
block_size)
block_size, device)
if backend == _Backend.FLASH_ATTN:
from vllm.attention.backends.flash_attn import ( # noqa: F401
FlashAttentionBackend)
@@ -123,7 +124,7 @@
ROCmFlashAttentionBackend)
return ROCmFlashAttentionBackend
elif backend == _Backend.TORCH_SDPA:
assert is_cpu(), RuntimeError(
assert is_cpu() or device == "cpu", RuntimeError(
"Torch SDPA backend is only used for the CPU device.")
logger.info("Using Torch SDPA backend.")
from vllm.attention.backends.torch_sdpa import TorchSDPABackend
@@ -158,6 +159,7 @@ def which_attn_to_use(
dtype: torch.dtype,
kv_cache_dtype: Optional[str],
block_size: int,
device=None,
) -> _Backend:
"""Returns which flash attention backend to use."""
# Default case.
@@ -178,7 +180,7 @@
if backend_by_env_var is not None:
selected_backend = backend_name_to_enum(backend_by_env_var)

if is_cpu():
if is_cpu() or device == "cpu":
if selected_backend != _Backend.TORCH_SDPA:
logger.info("Cannot use %s backend on CPU.", selected_backend)
return _Backend.TORCH_SDPA
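The selector change above is what lets a CPU draft worker inside a CUDA build fall through to the Torch SDPA backend. A simplified, self-contained stand-in for the new `device` check; the enum and helper below are toy versions, not the actual selector:

```python
# Toy reduction of the `is_cpu() or device == "cpu"` check added to
# which_attn_to_use(): a worker that passes device="cpu" gets the Torch SDPA
# backend even though the overall build targets CUDA.
from enum import Enum, auto
from typing import Optional


class Backend(Enum):
    FLASH_ATTN = auto()
    TORCH_SDPA = auto()


def pick_backend(platform_is_cpu: bool, device: Optional[str]) -> Backend:
    if platform_is_cpu or device == "cpu":
        return Backend.TORCH_SDPA
    return Backend.FLASH_ATTN


assert pick_backend(False, "cpu") is Backend.TORCH_SDPA   # CPU draft worker
assert pick_backend(False, None) is Backend.FLASH_ATTN    # GPU target model
```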
10 changes: 9 additions & 1 deletion vllm/config.py
@@ -1093,6 +1093,7 @@ def maybe_create_spec_config(
typical_acceptance_sampler_posterior_threshold: Optional[float],
typical_acceptance_sampler_posterior_alpha: Optional[float],
disable_logprobs: Optional[bool],
cpu_draft_worker: Optional[bool],
) -> Optional["SpeculativeConfig"]:
"""Create a SpeculativeConfig if possible, else return None.

@@ -1150,6 +1151,7 @@
If set to False, token log probabilities are returned
according to the log probability settings in SamplingParams.
If not specified, it defaults to True.
cpu_draft_worker (Optional[bool]): Run draft model on CPU.

Returns:
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
@@ -1251,7 +1253,8 @@
draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
target_parallel_config,
speculative_draft_tensor_parallel_size, draft_hf_config))
speculative_draft_tensor_parallel_size, draft_hf_config,
cpu_draft_worker))

if num_speculative_tokens is None:
raise ValueError(
@@ -1280,6 +1283,7 @@
typical_acceptance_sampler_posterior_alpha,
disable_logprobs=disable_logprobs,
disable_log_stats=disable_log_stats,
cpu_draft_worker=cpu_draft_worker,
)

@staticmethod
Expand Down Expand Up @@ -1322,6 +1326,7 @@ def create_draft_parallel_config(
target_parallel_config: ParallelConfig,
speculative_draft_tensor_parallel_size: Optional[int],
draft_hf_config: PretrainedConfig,
cpu_draft_worker: Optional[bool],
) -> ParallelConfig:
"""Create a parallel config for use by the draft worker.

Expand Down Expand Up @@ -1374,6 +1379,7 @@ def __init__(
typical_acceptance_sampler_posterior_alpha: float,
disable_logprobs: bool,
disable_log_stats: bool,
cpu_draft_worker: Optional[bool],
):
"""Create a SpeculativeConfig object.

@@ -1408,6 +1414,7 @@ def __init__(
returned.
disable_log_stats: Whether to disable periodic printing of stage
times in speculative decoding.
cpu_draft_worker: Run draft model on CPU.
"""
self.draft_model_config = draft_model_config
self.draft_parallel_config = draft_parallel_config
@@ -1423,6 +1430,7 @@
typical_acceptance_sampler_posterior_alpha
self.disable_logprobs = disable_logprobs
self.disable_log_stats = disable_log_stats
self.cpu_draft_worker = cpu_draft_worker or False

self._verify_args()

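For completeness, a small sketch of how the new flag is normalized by the `cpu_draft_worker or False` line above and how a caller might branch on it; the dataclass is a stand-in, not the real SpeculativeConfig:

```python
# Stand-in illustrating the default handling added in SpeculativeConfig:
# an unset (None) cpu_draft_worker behaves the same as False, so existing
# GPU-only speculative decoding setups are unaffected.
from dataclasses import dataclass
from typing import Optional


@dataclass
class SpecConfigSketch:
    cpu_draft_worker: bool


def make_spec_config(cpu_draft_worker: Optional[bool]) -> SpecConfigSketch:
    # Mirrors `self.cpu_draft_worker = cpu_draft_worker or False`.
    return SpecConfigSketch(cpu_draft_worker=cpu_draft_worker or False)


assert make_spec_config(None).cpu_draft_worker is False
draft_device = "cpu" if make_spec_config(True).cpu_draft_worker else "cuda"
assert draft_device == "cpu"
```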