jd-opensource
diff --git a/xllm/core/framework/batch/batch_input_builder.cpp b/xllm/core/framework/batch/batch_input_builder.cpp
Lines changed: 11 additions & 3 deletions
diff --git a/xllm/core/framework/batch/batch_input_builder.h b/xllm/core/framework/batch/batch_input_builder.h
Lines changed: 5 additions & 1 deletion
diff --git a/xllm/core/framework/model/model_input_params.h b/xllm/core/framework/model/model_input_params.h
Lines changed: 21 additions & 0 deletions
diff --git a/xllm/core/kernels/CMakeLists.txt b/xllm/core/kernels/CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/xllm/core/kernels/cuda/CMakeLists.txt b/xllm/core/kernels/cuda/CMakeLists.txt
Lines changed: 20 additions & 0 deletions
diff --git a/xllm/core/kernels/cuda/active.cpp b/xllm/core/kernels/cuda/active.cpp
Lines changed: 81 additions & 0 deletions
diff --git a/xllm/core/kernels/cuda/append_paged_kv_cache.cpp b/xllm/core/kernels/cuda/append_paged_kv_cache.cpp
Lines changed: 141 additions & 0 deletions
@@ -212,7 +212,7 @@ void BatchInputBuilder::process_sequences_multithreaded(uint32_t start_idx,
     state_.q_seq_lens.insert(state_.q_seq_lens.end(),
                              state.q_seq_lens.begin(),
                              state.q_seq_lens.end());
-#elif defined(USE_MLU)
+#elif defined(USE_MLU) || defined(USE_CUDA)
     int32_t seq_len_offset = state_.seq_lens.back();
     // skip the first element which is 0
     for (size_t i = 1; i < state.seq_lens.size(); ++i) {
@@ -281,7 +281,7 @@ void BatchInputBuilder::process_single_sequence(
 #if defined(USE_NPU)
   state.seq_lens.push_back(seq_len);
   state.q_seq_lens.push_back(q_seq_len);
-#elif defined(USE_MLU)
+#elif defined(USE_MLU) || defined(USE_CUDA)
   state.seq_lens.push_back(state.seq_lens.back() + seq_len);
   state.q_seq_lens.push_back(state.q_seq_lens.back() + q_seq_len);
 #endif
@@ -425,7 +425,12 @@ void BatchInputBuilder::setup_kv_cache_info(
     block_size = block.size();
     block_ids.push_back(block.id());
     u_block_ids.emplace_back(block.id());
+    state.paged_kv_indices.push_back(block.id());
   }
+  state.paged_kv_indptr.push_back(state.paged_kv_indptr.back() + blocks.size());
+  int32_t last_page_len =
+      (seq_len % block_size == 0) ? block_size : seq_len % block_size;
+  state.paged_kv_last_page_len.push_back(last_page_len);
 
   int32_t kv_cache_block_idx = n_kv_cache_tokens / block_size;
   for (auto iter = block_ids.begin() + kv_cache_block_idx;
@@ -494,12 +499,15 @@ void BatchInputBuilder::padding_decode_batch_size(
 #if defined(USE_NPU)
         state_.seq_lens.push_back(num_decoding_tokens);
         state_.q_seq_lens.push_back(num_decoding_tokens);
-#elif defined(USE_MLU)
+#elif defined(USE_MLU) || defined(USE_CUDA)
         state_.seq_lens.push_back(state_.seq_lens.back() + num_decoding_tokens);
         state_.q_seq_lens.push_back(state_.q_seq_lens.back() +
                                     num_decoding_tokens);
 #endif
         state_.block_tables_vec.emplace_back();
+        state_.paged_kv_indices.push_back(0);
+        state_.paged_kv_indptr.push_back(state_.paged_kv_indptr.back() + 1);
+        state_.paged_kv_last_page_len.push_back(1);
       }
     }
   }
 
@@ -103,6 +103,11 @@ class BatchInputBuilder {
     // for continuous kvcache
     std::vector<int64_t> new_cache_slot_offsets;  //[n_tokens]
     std::vector<int64_t> kv_cache_start_offsets;  //[n_seq]
+
+    // for flashinfer
+    std::vector<int32_t> paged_kv_indptr = {0};
+    std::vector<int32_t> paged_kv_indices;
+    std::vector<int32_t> paged_kv_last_page_len;
   };
 
   // Helper methods for sequence processing
@@ -127,7 +132,6 @@ class BatchInputBuilder {
       uint32_t q_seq_len,
       BuilderState* state_ptr = nullptr,
       std::unordered_set<int32_t>* write_block_ids_ptr = nullptr);
-
   void setup_continuous_kv_cache_info(Sequence* sequence,
                                       uint32_t n_kv_cache_tokens,
                                       uint32_t seq_len,
 
@@ -92,6 +92,12 @@ struct ModelInputParams {
 
     // Copy graph_buffer to device
     params.graph_buffer = safe_to(graph_buffer, device, true);
+
+    // params for flashinfer
+    params.paged_kv_indptr = safe_to(paged_kv_indptr, device);
+    params.paged_kv_indices = safe_to(paged_kv_indices, device);
+    params.paged_kv_last_page_len = safe_to(paged_kv_last_page_len, device);
+
     return params;
   }
 
@@ -187,6 +193,21 @@ struct ModelInputParams {
   // Graph execution buffer for temporary tensor storage
   // Used by ACL Graph Executor to avoid repeated memory allocation
   torch::Tensor graph_buffer;
+
+  // the indptr of the paged kv-cache
+  // used in flashinfer
+  // IntTensor: [n_seq + 1]
+  torch::Tensor paged_kv_indptr;
+
+  // the page indices of the paged kv cache
+  // used in flashinfer
+  torch::Tensor paged_kv_indices;
+
+  // the number of entries in the last page of each request in
+  // the paged kv cache
+  // used in flashinfer
+  // IntTensor: [n_seq]
+  torch::Tensor paged_kv_last_page_len;
 };
 
 }  // namespace xllm
@@ -17,9 +17,9 @@ cc_library(
     kernels
   HDRS
     param.h
-    torch_ops_api.h
+    ops_api.h
   SRCS
-    torch_ops_api.cpp
+    ops_api.cpp
   DEPS
     torch
     $<$<BOOL:${USE_NPU}>:npu_kernels>
 
@@ -0,0 +1,20 @@
+# Library target bundling the CUDA kernel wrappers found in this directory.
+include(cc_library)
+
+# Every header below this directory, searched recursively.
+file(GLOB_RECURSE CUDA_HEADER_FILES "${CMAKE_CURRENT_LIST_DIR}/*.h")
+
+# Every .cpp translation unit below this directory, searched recursively.
+# NOTE(review): some of these sources contain __device__ code; confirm that
+# cc_library compiles them with the CUDA toolchain.
+file(GLOB_RECURSE CUDA_SOURCE_FILES "${CMAKE_CURRENT_LIST_DIR}/*.cpp")
+
+cc_library(
+  NAME cuda_kernels
+  HDRS ${CUDA_HEADER_FILES}
+  SRCS ${CUDA_SOURCE_FILES}
+  DEPS flashinfer
+)
@@ -0,0 +1,81 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cuda_runtime.h>
+
+#include <flashinfer/activation.cuh>
+
+#include "cuda_ops_api.h"
+
+using namespace flashinfer;
+
+namespace xllm::kernel::cuda {
+
+// SiLU activation: x / (1 + exp(-x)), with the exponential evaluated by the
+// __expf intrinsic.
+__device__ __forceinline__ float silu(const float& val) {
+  const float denom = 1.0f + __expf(-val);
+  return val / denom;
+}
+
+// GELU activation in its erf form: 0.5 * x * (1 + erf(x / sqrt(2))).
+__device__ __forceinline__ float gelu(const float& val) {
+  constexpr float kInvSqrt2 = M_SQRT1_2;  // 1 / sqrt(2)
+  const float erf_term = ::erf(val * kInvSqrt2);
+  return val * 0.5f * (1.0f + erf_term);
+}
+
+// GELU activation, tanh approximation:
+//   0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))).
+__device__ __forceinline__ float gelu_tanh(const float& val) {
+  constexpr float kSqrt2OverPi = 0.7978845608028654f;
+  const float inner = kSqrt2OverPi * (val + 0.044715f * val * val * val);
+  const float cdf = 0.5f * (1.0f + math::tanh(inner));
+  return val * cdf;
+}
+
+// Launches flashinfer's activation::act_and_mul_kernel for the requested
+// activation.
+//
+// The last dimension of `input` is split into two halves of width
+// d = last_dim / 2, and one thread block is launched per token (row), where
+// the token count is numel / last_dim. See flashinfer/activation.cuh for the
+// element-wise formula the kernel applies.
+//
+// Params:
+//   out        - destination buffer; also selects the CUDA device and stream.
+//   input      - source tensor; its dtype selects the kernel element type.
+//   act_mode   - activation name. Recognised values: "silu", "gelu" and
+//                "gelu_tanh", mapped to the helpers of the same name in this
+//                file. NOTE(review): the spelling of the mode strings is
+//                assumed from the helper names - confirm against callers.
+//   enable_pdl - forwarded as the programmaticStreamSerializationAllowed
+//                launch attribute.
+//
+// Fails a TVM_FFI_ICHECK on an unknown act_mode, a rank-0 input, or a kernel
+// launch error. An input with zero elements is a no-op.
+void act_and_mul(TensorView out,
+                 TensorView input,
+                 const std::string& act_mode,
+                 bool enable_pdl) {
+  // A template argument has to be a compile-time constant, so the runtime
+  // act_mode string is first mapped onto a fixed set of instantiations.
+  enum class ActKind { kSilu, kGelu, kGeluTanh };
+  ActKind act_kind = ActKind::kSilu;
+  if (act_mode == "silu") {
+    act_kind = ActKind::kSilu;
+  } else if (act_mode == "gelu") {
+    act_kind = ActKind::kGelu;
+  } else if (act_mode == "gelu_tanh") {
+    act_kind = ActKind::kGeluTanh;
+  } else {
+    TVM_FFI_ICHECK(false) << "Unsupported act_mode: " << act_mode;
+  }
+
+  TVM_FFI_ICHECK(input->ndim >= 1) << "act_and_mul expects input rank >= 1";
+  // Nothing to compute. Returning here also avoids a zero-sized grid and a
+  // division by a zero-length last dimension below.
+  if (input.numel() == 0) {
+    return;
+  }
+
+  const int64_t last_dim = input->shape[input->ndim - 1];
+  int d = static_cast<int>(last_dim / 2);
+  int64_t num_tokens = input.numel() / last_dim;
+
+  cudaSetDevice(out->device.device_id);
+  const cudaStream_t stream = get_stream(out->device);
+  DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(input->dtype, c_type, [&] {
+    // 16-byte vectorised accesses: elements handled per thread per step.
+    uint32_t vec_size = 16 / sizeof(c_type);
+    cudaLaunchConfig_t config;
+    config.gridDim = num_tokens;
+    config.blockDim = std::min(d / vec_size, 1024U);
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+    config.numAttrs = 1;
+    config.attrs = attrs;
+
+    // Start from the silu instantiation so `kernel` gets the concrete
+    // function-pointer type, then swap in the requested activation.
+    auto kernel = activation::act_and_mul_kernel<c_type, silu>;
+    if (act_kind == ActKind::kGelu) {
+      kernel = activation::act_and_mul_kernel<c_type, gelu>;
+    } else if (act_kind == ActKind::kGeluTanh) {
+      kernel = activation::act_and_mul_kernel<c_type, gelu_tanh>;
+    }
+
+    cudaLaunchKernelEx(&config,
+                       kernel,
+                       static_cast<c_type*>(out->data),
+                       static_cast<c_type*>(input->data),
+                       d);
+
+    cudaError_t err = cudaGetLastError();
+    TVM_FFI_ICHECK(err == cudaSuccess)
+        << "Failed to launch kernel: " << cudaGetErrorString(err);
+
+    return true;
+  });
+}
+
+}  // namespace xllm::kernel::cuda
@@ -0,0 +1,141 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <flashinfer/page.cuh>
+
+#include "cuda_ops_api.h"
+
+using namespace flashinfer;
+
+using tvm::ffi::Tensor;
+
+namespace xllm::kernel::cuda {
+
+// Appends new key/value rows into a paged KV cache through flashinfer's
+// AppendPagedKVCache.
+//
+// Shapes enforced by the checks below:
+//   append_key / append_value     : 3-D, [nnz, num_heads, head_dim]
+//   batch_indices / positions     : 1-D, [nnz]
+//   paged_k_cache / paged_v_cache : 4-D; dims 1 and 2 are
+//       (num_heads, page_size) for QKVLayout::kHND, otherwise
+//       (page_size, num_heads); dim 3 is head_dim. Both caches must have
+//       identical strides.
+//   kv_indices                    : 1-D
+//   kv_indptr                     : 1-D, [batch_size + 1]
+//   kv_last_page_len              : 1-D, [batch_size]
+//   layout                        : integer converted to flashinfer::QKVLayout
+//
+// Every tensor must live on the same device as append_key. Any violated
+// check, an unsupported cache dtype, or a CUDA error reported by
+// AppendPagedKVCache fails a TVM_FFI_ICHECK.
+void append_paged_kv_cache(TensorView append_key,
+                           TensorView append_value,
+                           TensorView batch_indices,
+                           TensorView positions,
+                           TensorView paged_k_cache,
+                           TensorView paged_v_cache,
+                           TensorView kv_indices,
+                           TensorView kv_indptr,
+                           TensorView kv_last_page_len,
+                           int64_t layout) {
+  CHECK_LAST_DIM_CONTIGUOUS(append_key);
+  CHECK_LAST_DIM_CONTIGUOUS(append_value);
+  CHECK_INPUT(batch_indices);
+  CHECK_INPUT(positions);
+  // NOTE(Zihao): doesn't have to be contiguous
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(paged_k_cache);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(paged_v_cache);
+  CHECK_INPUT(kv_indices);
+  CHECK_INPUT(kv_indptr);
+  CHECK_INPUT(kv_last_page_len);
+  CHECK_DIM(3, append_key);
+  CHECK_DIM(3, append_value);
+  CHECK_DIM(1, batch_indices);
+  CHECK_DIM(1, positions);
+  CHECK_DIM(4, paged_k_cache);
+  CHECK_DIM(4, paged_v_cache);
+  CHECK_DIM(1, kv_indices);
+  CHECK_DIM(1, kv_indptr);
+  CHECK_DIM(1, kv_last_page_len);
+  unsigned int nnz = append_key->shape[0];
+  unsigned int batch_size = kv_last_page_len->shape[0];
+  TVM_FFI_ICHECK_EQ(kv_indptr->shape[0], batch_size + 1);
+  TVM_FFI_ICHECK_EQ(batch_indices->shape[0], nnz);
+  TVM_FFI_ICHECK_EQ(positions->shape[0], nnz);
+  CHECK_DEVICE(append_key, append_key);
+  CHECK_DEVICE(append_value, append_key);
+  // batch_indices and positions are dereferenced by the kernel as well, so
+  // they must share the launch device like every other tensor.
+  CHECK_DEVICE(batch_indices, append_key);
+  CHECK_DEVICE(positions, append_key);
+  CHECK_DEVICE(paged_k_cache, append_key);
+  CHECK_DEVICE(paged_v_cache, append_key);
+  CHECK_DEVICE(kv_indices, append_key);
+  CHECK_DEVICE(kv_indptr, append_key);
+  CHECK_DEVICE(kv_last_page_len, append_key);
+
+  QKVLayout kv_layout = QKVLayout(layout);
+
+  // Dims 1 and 2 of the cache swap meaning depending on the layout.
+  unsigned int num_heads, page_size, head_dim;
+  head_dim = paged_k_cache->shape[3];
+  if (kv_layout == QKVLayout::kHND) {
+    num_heads = paged_k_cache->shape[1];
+    page_size = paged_k_cache->shape[2];
+  } else {
+    page_size = paged_k_cache->shape[1];
+    num_heads = paged_k_cache->shape[2];
+  }
+
+  // get kv_cache_strides
+  // Only k_strides is handed to paged_kv_t, hence the equality requirement.
+  auto k_strides = paged_k_cache->strides;
+  auto v_strides = paged_v_cache->strides;
+  auto k_dim = paged_k_cache->ndim;
+  TVM_FFI_ICHECK(std::equal(k_strides, k_strides + k_dim, v_strides))
+      << "k/v strides must be identical";
+
+  auto append_k_strides = append_key->strides;
+  auto append_k_stride_n = append_k_strides[0];
+  auto append_k_stride_h = append_k_strides[1];
+  auto append_v_strides = append_value->strides;
+  auto append_v_stride_n = append_v_strides[0];
+  auto append_v_stride_h = append_v_strides[1];
+
+  TVM_FFI_ICHECK_EQ(append_key->shape[1], num_heads);
+  TVM_FFI_ICHECK_EQ(append_key->shape[2], head_dim);
+  TVM_FFI_ICHECK_EQ(append_value->shape[1], num_heads);
+  TVM_FFI_ICHECK_EQ(append_value->shape[2], head_dim);
+
+  cudaSetDevice(append_key->device.device_id);
+  const cudaStream_t stream = get_stream(append_key->device);
+  // The element type is chosen from paged_k_cache's dtype.
+  // NOTE(review): the index tensors are reinterpreted as int32_t and the
+  // append/value buffers as c_type without a dtype check - confirm callers
+  // always pass int32 indices and matching element types.
+  bool success =
+      DISPATCH_DLPACK_DTYPE_TO_CTYPE(paged_k_cache->dtype, c_type, [&] {
+        paged_kv_t<c_type, int32_t> paged_kv(
+            num_heads,
+            page_size,
+            head_dim,
+            batch_size,
+            kv_layout,
+            static_cast<c_type*>(paged_k_cache->data),
+            static_cast<c_type*>(paged_v_cache->data),
+            k_strides,
+            static_cast<int32_t*>(kv_indices->data),
+            static_cast<int32_t*>(kv_indptr->data),
+            static_cast<int32_t*>(kv_last_page_len->data));
+        cudaError_t status =
+            AppendPagedKVCache(paged_kv,
+                               static_cast<c_type*>(append_key->data),
+                               static_cast<c_type*>(append_value->data),
+                               static_cast<int32_t*>(batch_indices->data),
+                               static_cast<int32_t*>(positions->data),
+                               nnz,
+                               append_k_stride_n,
+                               append_k_stride_h,
+                               append_v_stride_n,
+                               append_v_stride_h,
+                               stream);
+        TVM_FFI_ICHECK(status == cudaSuccess)
+            << "AppendPagedKVCache failed with error: "
+            << cudaGetErrorString(status);
+        return true;
+      });
+
+  TVM_FFI_ICHECK(success) << "AppendPagedKVCache failed to dispatch with dtype "
+                          << paged_k_cache->dtype;
+}
+
+//
+
+//
+
+}  // namespace xllm::kernel::cuda