From ea6ac481425ffaffcb8649c1f1d11defc14adb13 Mon Sep 17 00:00:00 2001
From: weicj <weicj520@gmail.com>
Date: Sat, 30 May 2026 00:10:06 +0800
Subject: [PATCH] feat(dflash): reduce feature mirror memory with dtype policy

---
 server/src/common/dflash_feature_ring.cpp     | 331 +++++++++++++++---
 server/src/common/dflash_feature_ring.h       |  27 +-
 .../src/qwen35/qwen35_layer_split_adapter.cpp |  27 +-
 3 files changed, 316 insertions(+), 69 deletions(-)
diff --git a/server/src/common/dflash_feature_ring.cpp b/server/src/common/dflash_feature_ring.cpp
index 8cf8fa83a..d7b03f5c7 100644
--- a/server/src/common/dflash_feature_ring.cpp
+++ b/server/src/common/dflash_feature_ring.cpp
@@ -12,7 +12,10 @@ using to_fp32_cuda_t = void (*)(const void *, float *, int64_t, cudaStream_t);
 extern "C++" to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
 
 #include <algorithm>
+#include <cstdlib>
+#include <cstring>
 #include <cstdio>
+#include <vector>
 
 #include "gpu_runtime_compat.h"
 
@@ -20,29 +23,219 @@ namespace dflash::common {
 
 // ── internal helpers ────────────────────────────────────────────
 
-static bool ensure_bf16_staging(DraftFeatureMirror & mirror, size_t elems) {
-    if (elems <= mirror.bf16_staging_elems) return true;
+static bool ensure_staging(DraftFeatureMirror & mirror, size_t bytes) {
+    if (bytes <= mirror.staging_bytes) return true;
     cudaError_t err = cudaSetDevice(mirror.device);
     if (err != cudaSuccess) return false;
-    if (mirror.bf16_staging) {
-        cudaFree(mirror.bf16_staging);
-        mirror.bf16_staging = nullptr;
-        mirror.bf16_staging_elems = 0;
+    if (mirror.staging) {
+        err = cudaFree(mirror.staging);
+        if (err != cudaSuccess) return false;
+        mirror.staging = nullptr;
+        mirror.staging_bytes = 0;
+    }
+    err = cudaMalloc(&mirror.staging, bytes);
+    if (err != cudaSuccess) return false;
+    mirror.staging_bytes = bytes;
+    return true;
+}
+
+static ggml_type parse_feature_dtype() {
+    const char * s = std::getenv("DFLASH_FEATURE_DTYPE");
+    if (!s || !s[0] || std::strcmp(s, "f32") == 0 || std::strcmp(s, "F32") == 0) {
+        return GGML_TYPE_F32;
+    }
+    if (std::strcmp(s, "f16") == 0 || std::strcmp(s, "F16") == 0) {
+        return GGML_TYPE_F16;
+    }
+    if (std::strcmp(s, "bf16") == 0 || std::strcmp(s, "BF16") == 0) {
+        return GGML_TYPE_BF16;
+    }
+    if (std::strcmp(s, "q8_0") == 0 || std::strcmp(s, "Q8_0") == 0 ||
+        std::strcmp(s, "q8") == 0 || std::strcmp(s, "Q8") == 0) {
+        return GGML_TYPE_Q8_0;
+    }
+    std::fprintf(stderr, "[dflash-feature] ignoring unsupported DFLASH_FEATURE_DTYPE=%s\n", s);
+    return GGML_TYPE_F32;
+}
+
+static bool check_feature_width_compatible(ggml_type type, int width) {
+    const int64_t blck = ggml_blck_size(type);
+    return blck > 0 && width > 0 && width % blck == 0;
+}
+
+static bool quantize_host_f32_to_feature_type(ggml_type type,
+                                              const float * src,
+                                              void * dst,
+                                              size_t elems) {
+    const auto * traits = ggml_get_type_traits(type);
+    if (!traits || !traits->from_float_ref) return false;
+    if (traits->blck_size <= 0 || elems % (size_t)traits->blck_size != 0) return false;
+    traits->from_float_ref(src, dst, (int64_t)elems);
+    return true;
+}
+
+static bool dequantize_feature_type_to_host_f32(ggml_type type,
+                                                const void * src,
+                                                float * dst,
+                                                size_t elems) {
+    const auto * traits = ggml_get_type_traits(type);
+    if (!traits || !traits->to_float) return false;
+    if (traits->blck_size <= 0 || elems % (size_t)traits->blck_size != 0) return false;
+    traits->to_float(src, dst, (int64_t)elems);
+    return true;
+}
+
+static bool host_f32_to_feature_row(ggml_type type,
+                                    const float * src,
+                                    void * dst,
+                                    size_t elems) {
+    if (type == GGML_TYPE_F32) {
+        std::memcpy(dst, src, elems * sizeof(float));
+        return true;
+    }
+    if (type == GGML_TYPE_F16) {
+        ggml_fp32_to_fp16_row(src, (ggml_fp16_t *)dst, (int64_t)elems);
+        return true;
+    }
+    if (type == GGML_TYPE_BF16) {
+        ggml_fp32_to_bf16_row(src, (ggml_bf16_t *)dst, (int64_t)elems);
+        return true;
+    }
+    return quantize_host_f32_to_feature_type(type, src, dst, elems);
+}
+
+static bool feature_row_to_host_f32(ggml_type type,
+                                    const void * src,
+                                    float * dst,
+                                    size_t elems) {
+    if (type == GGML_TYPE_F32) {
+        std::memcpy(dst, src, elems * sizeof(float));
+        return true;
+    }
+    if (type == GGML_TYPE_F16) {
+        ggml_fp16_to_fp32_row((const ggml_fp16_t *)src, dst, (int64_t)elems);
+        return true;
+    }
+    if (type == GGML_TYPE_BF16) {
+        ggml_bf16_to_fp32_row((const ggml_bf16_t *)src, dst, (int64_t)elems);
+        return true;
+    }
+    return dequantize_feature_type_to_host_f32(type, src, dst, elems);
+}
+
+static bool convert_device_f32_to_feature_type(DraftFeatureMirror & mirror,
+                                               const void * src,
+                                               int src_device,
+                                               void * dst,
+                                               size_t elems) {
+    if (mirror.storage_type == GGML_TYPE_F32) {
+        return copy_peer_async(dst, mirror.device, src, src_device,
+                               elems * sizeof(float));
+    }
+
+    std::vector<float> host(elems);
+    cudaError_t err = cudaSetDevice(src_device);
+    if (err != cudaSuccess) return false;
+    err = cudaMemcpy(host.data(), src, elems * sizeof(float),
+                     cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) return false;
+
+    const size_t row_bytes = ggml_row_size(mirror.storage_type, (int64_t)elems);
+    std::vector<uint8_t> tmp(row_bytes);
+    if (!host_f32_to_feature_row(mirror.storage_type, host.data(), tmp.data(), elems)) {
+        return false;
     }
-    err = cudaMalloc(&mirror.bf16_staging, elems * sizeof(uint16_t));
+    ggml_backend_tensor_set(mirror.target_feat, tmp.data(),
+                            (size_t)((char *)dst - (char *)mirror.target_feat->data),
+                            row_bytes);
+    return true;
+}
+
+static bool convert_bf16_feature_to_storage(DraftFeatureMirror & mirror,
+                                            const void * src,
+                                            int src_device,
+                                            void * dst,
+                                            size_t elems) {
+    if (mirror.storage_type == GGML_TYPE_BF16) {
+        return copy_peer_async(dst, mirror.device, src, src_device,
+                               elems * sizeof(ggml_bf16_t));
+    }
+
+    const size_t blck = (size_t)ggml_blck_size(mirror.storage_type);
+    if (blck == 0 || elems % blck != 0) return false;
+
+    constexpr size_t max_chunk_bytes = 4u * 1024u * 1024u;
+    const size_t max_chunk_elems =
+        std::max(blck, (max_chunk_bytes / sizeof(float) / blck) * blck);
+    const size_t dst_offset = (size_t)((char *)dst - (char *)mirror.target_feat->data);
+
+    cudaError_t err = cudaSetDevice(src_device);
     if (err != cudaSuccess) return false;
-    mirror.bf16_staging_elems = elems;
+
+    size_t done = 0;
+    size_t dst_bytes_done = 0;
+    while (done < elems) {
+        size_t chunk = std::min(elems - done, max_chunk_elems);
+        chunk = (chunk / blck) * blck;
+        if (chunk == 0) return false;
+
+        std::vector<ggml_bf16_t> bf16_host(chunk);
+        err = cudaMemcpy(bf16_host.data(),
+                         (const char *)src + done * sizeof(ggml_bf16_t),
+                         chunk * sizeof(ggml_bf16_t),
+                         cudaMemcpyDeviceToHost);
+        if (err != cudaSuccess) return false;
+
+        std::vector<float> host(chunk);
+        ggml_bf16_to_fp32_row(bf16_host.data(), host.data(), (int64_t)chunk);
+
+        const size_t chunk_bytes = ggml_row_size(mirror.storage_type, (int64_t)chunk);
+        std::vector<uint8_t> tmp(chunk_bytes);
+        if (!host_f32_to_feature_row(mirror.storage_type, host.data(), tmp.data(), chunk)) {
+            return false;
+        }
+        ggml_backend_tensor_set(mirror.target_feat, tmp.data(),
+                                dst_offset + dst_bytes_done, chunk_bytes);
+        done += chunk;
+        dst_bytes_done += chunk_bytes;
+    }
     return true;
 }
 
+static bool copy_feature_to_f32(DraftFeatureMirror & mirror,
+                                const void * src,
+                                int src_device,
+                                float * dst,
+                                size_t elems) {
+    if (mirror.storage_type == GGML_TYPE_F32) {
+        return copy_peer_async(dst, mirror.device, src, src_device,
+                               elems * sizeof(float));
+    }
+    auto to_f32 = ggml_get_to_fp32_cuda(mirror.storage_type);
+    if (!to_f32) return false;
+    const size_t src_bytes = ggml_row_size(mirror.storage_type, (int64_t)elems);
+    if (src_device != mirror.device) {
+        if (!ensure_staging(mirror, src_bytes)) return false;
+        if (!copy_peer_async(mirror.staging, mirror.device, src, src_device,
+                             src_bytes)) {
+            return false;
+        }
+        src = mirror.staging;
+    }
+    cudaError_t err = cudaSetDevice(mirror.device);
+    if (err != cudaSuccess) return false;
+    to_f32(src, dst, (int64_t)elems, nullptr);
+    return cudaGetLastError() == cudaSuccess;
+}
+
 // ── public API ──────────────────────────────────────────────────
 
 void draft_feature_mirror_free(DraftFeatureMirror & mirror) {
-    if (mirror.bf16_staging) {
-        cudaSetDevice(mirror.device);
-        cudaFree(mirror.bf16_staging);
-        mirror.bf16_staging = nullptr;
-        mirror.bf16_staging_elems = 0;
+    if (mirror.staging) {
+        (void)cudaSetDevice(mirror.device);
+        (void)cudaFree(mirror.staging);
+        mirror.staging = nullptr;
+        mirror.staging_bytes = 0;
     }
     if (mirror.buf) {
         ggml_backend_buffer_free(mirror.buf);
@@ -56,6 +249,7 @@ void draft_feature_mirror_free(DraftFeatureMirror & mirror) {
     mirror.device = 0;
     mirror.target_device = 0;
     mirror.cap = 0;
+    mirror.storage_type = GGML_TYPE_F32;
 }
 
 bool draft_feature_mirror_init(DraftFeatureMirror & mirror,
@@ -71,6 +265,14 @@ bool draft_feature_mirror_init(DraftFeatureMirror & mirror,
     mirror.target_device = target_device;
     mirror.n_target_layers = n_target_layers;
     mirror.hidden_size = hidden_size;
+    mirror.storage_type = parse_feature_dtype();
+    if (!check_feature_width_compatible(mirror.storage_type, hidden_size) ||
+        !check_feature_width_compatible(mirror.storage_type, n_target_layers * hidden_size)) {
+        std::fprintf(stderr,
+                     "[dflash-feature] unsupported mirror dtype=%s for hidden=%d layers=%d\n",
+                     ggml_type_name(mirror.storage_type), hidden_size, n_target_layers);
+        mirror.storage_type = GGML_TYPE_F32;
+    }
 
     ggml_init_params ip{};
     ip.mem_size = ggml_tensor_overhead() * 4 + 16 * 1024;
@@ -80,21 +282,27 @@ bool draft_feature_mirror_init(DraftFeatureMirror & mirror,
     if (!mirror.ctx) return false;
 
     const int fc_in = n_target_layers * hidden_size;
-    mirror.target_feat = ggml_new_tensor_2d(mirror.ctx, GGML_TYPE_F32, fc_in, cap);
+    mirror.target_feat = ggml_new_tensor_2d(mirror.ctx, mirror.storage_type, fc_in, cap);
     ggml_set_name(mirror.target_feat, "draft_target_feat_mirror");
     mirror.buf = ggml_backend_alloc_ctx_tensors(mirror.ctx, backend);
     if (!mirror.buf) {
         draft_feature_mirror_free(mirror);
         return false;
     }
-    const size_t bytes = (size_t)fc_in * (size_t)cap * sizeof(float);
-    cudaSetDevice(device);
-    cudaError_t err = cudaMemset(mirror.target_feat->data, 0, bytes);
+    const size_t bytes = ggml_nbytes(mirror.target_feat);
+    cudaError_t err = cudaSetDevice(device);
+    if (err != cudaSuccess) {
+        draft_feature_mirror_free(mirror);
+        return false;
+    }
+    err = cudaMemset(mirror.target_feat->data, 0, bytes);
     if (err != cudaSuccess) {
         draft_feature_mirror_free(mirror);
         return false;
     }
     mirror.cap = cap;
+    std::fprintf(stderr, "[dflash-feature] mirror dtype=%s cap=%d fc_in=%d\n",
+                 ggml_type_name(mirror.storage_type), cap, fc_in);
     return true;
 }
 
@@ -106,7 +314,7 @@ bool draft_feature_mirror_can_view(const DraftFeatureMirror & mirror,
     if (ctx_len <= 0 || ctx_len > mirror.cap || committed < ctx_len) return false;
     const int start = committed - ctx_len;
     slot0 = start % mirror.cap;
-    return slot0 + ctx_len <= mirror.cap;
+    return mirror.storage_type == GGML_TYPE_F32 && slot0 + ctx_len <= mirror.cap;
 }
 
 bool draft_feature_mirror_sync_range(const ggml_tensor * src_target_feat,
@@ -134,19 +342,8 @@ bool draft_feature_mirror_sync_range(const ggml_tensor * src_target_feat,
             (const char *)src_target_feat->data + (size_t)src_slot * src_stride;
         void * dst =
             (char *)mirror.target_feat->data + (size_t)dst_slot * dst_stride;
-        auto bf16_to_f32 = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
-        if (mirror.device == mirror.target_device) {
-            cudaSetDevice(mirror.device);
-            bf16_to_f32(src, (float *)dst, (int64_t)elems, nullptr);
-        } else {
-            if (!ensure_bf16_staging(mirror, elems)) return false;
-            if (!copy_peer_async(mirror.bf16_staging, mirror.device,
-                                 src, mirror.target_device,
-                                 elems * sizeof(uint16_t))) {
-                return false;
-            }
-            cudaSetDevice(mirror.device);
-            bf16_to_f32(mirror.bf16_staging, (float *)dst, (int64_t)elems, nullptr);
+        if (!convert_bf16_feature_to_storage(mirror, src, mirror.target_device, dst, elems)) {
+            return false;
         }
         cudaError_t err = cudaGetLastError();
         if (err != cudaSuccess) return false;
@@ -180,15 +377,15 @@ bool copy_capture_slice_to_draft_ring(
     const int hidden = feature_ring.hidden_size;
     const size_t dst_stride = feature_ring.target_feat->nb[1];
     const size_t src_stride = act_out->nb[1];
-    const size_t row_bytes = (size_t)hidden * sizeof(float);
+    const size_t row_elems = (size_t)hidden;
     for (int i = 0; i < n_tokens; i++) {
         const int slot = (start_pos + i) % feature_ring.cap;
         const void * src = (const char *)act_out->data +
             (size_t)(chunk_start + i) * src_stride;
         void * dst = (char *)feature_ring.target_feat->data +
             (size_t)slot * dst_stride +
-            (size_t)capture_idx * (size_t)hidden * sizeof(float);
-        if (!copy_peer_async(dst, feature_ring.device, src, src_device, row_bytes)) {
+            (size_t)capture_idx * ggml_row_size(feature_ring.storage_type, hidden);
+        if (!convert_device_f32_to_feature_type(feature_ring, src, src_device, dst, row_elems)) {
             return false;
         }
     }
@@ -214,7 +411,8 @@ bool copy_feature_ring_range_to_tensor(
         const char * src_base =
             (const char *)feature_ring.target_feat->data + (size_t)slot * src_stride;
         char * dst_base = (char *)dst->data + (size_t)done * dst_stride;
-        if (src_stride == row_bytes && dst_stride == row_bytes) {
+        if (feature_ring.storage_type == GGML_TYPE_F32 &&
+            src_stride == row_bytes && dst_stride == row_bytes) {
             if (!copy_peer_async(dst_base, feature_ring.device,
                                  src_base, feature_ring.device,
                                  row_bytes * (size_t)run)) {
@@ -222,11 +420,12 @@ bool copy_feature_ring_range_to_tensor(
             }
         } else {
             for (int i = 0; i < run; i++) {
-                if (!copy_peer_async(dst_base + (size_t)i * dst_stride,
-                                     feature_ring.device,
-                                     src_base + (size_t)i * src_stride,
-                                     feature_ring.device,
-                                     row_bytes)) {
+                if (!copy_feature_to_f32(
+                        const_cast<DraftFeatureMirror &>(feature_ring),
+                        src_base + (size_t)i * src_stride,
+                        feature_ring.device,
+                        (float *)(dst_base + (size_t)i * dst_stride),
+                        (size_t)fc_in)) {
                     return false;
                 }
             }
@@ -236,4 +435,56 @@ bool copy_feature_ring_range_to_tensor(
     return cudaDeviceSynchronize() == cudaSuccess;
 }
 
+bool copy_feature_ring_range_to_host_f32(
+    const DraftFeatureMirror & feature_ring,
+    int start_pos,
+    int n_tokens,
+    std::vector<float> & out) {
+    if (!feature_ring.target_feat || feature_ring.cap <= 0) return false;
+    if (n_tokens <= 0 || n_tokens > feature_ring.cap) return false;
+
+    const int fc_in = feature_ring.n_target_layers * feature_ring.hidden_size;
+    const size_t row_bytes = ggml_row_size(feature_ring.storage_type, fc_in);
+    const size_t src_stride = feature_ring.target_feat->nb[1];
+    std::vector<uint8_t> row(row_bytes);
+    out.resize((size_t)n_tokens * (size_t)fc_in);
+
+    for (int i = 0; i < n_tokens; ++i) {
+        const int slot = (start_pos + i) % feature_ring.cap;
+        ggml_backend_tensor_get(feature_ring.target_feat, row.data(),
+                                (size_t)slot * src_stride, row_bytes);
+        float * dst = out.data() + (size_t)i * (size_t)fc_in;
+        if (!feature_row_to_host_f32(feature_ring.storage_type, row.data(), dst, fc_in)) {
+            return false;
+        }
+    }
+    return true;
+}
+
+bool copy_host_f32_to_feature_ring_range(
+    DraftFeatureMirror & feature_ring,
+    int start_pos,
+    int n_tokens,
+    const std::vector<float> & src) {
+    if (!feature_ring.target_feat || feature_ring.cap <= 0) return false;
+    if (n_tokens <= 0 || n_tokens > feature_ring.cap) return false;
+
+    const int fc_in = feature_ring.n_target_layers * feature_ring.hidden_size;
+    if (src.size() < (size_t)n_tokens * (size_t)fc_in) return false;
+    const size_t dst_stride = feature_ring.target_feat->nb[1];
+    const size_t row_bytes = ggml_row_size(feature_ring.storage_type, fc_in);
+    std::vector<uint8_t> row(row_bytes);
+
+    for (int i = 0; i < n_tokens; ++i) {
+        const float * src_row = src.data() + (size_t)i * (size_t)fc_in;
+        if (!host_f32_to_feature_row(feature_ring.storage_type, src_row, row.data(), fc_in)) {
+            return false;
+        }
+        const int slot = (start_pos + i) % feature_ring.cap;
+        ggml_backend_tensor_set(feature_ring.target_feat, row.data(),
+                                (size_t)slot * dst_stride, row_bytes);
+    }
+    return true;
+}
+
 }  // namespace dflash::common
diff --git a/server/src/common/dflash_feature_ring.h b/server/src/common/dflash_feature_ring.h
index e681cecfb..805618e22 100644
--- a/server/src/common/dflash_feature_ring.h
+++ b/server/src/common/dflash_feature_ring.h
@@ -1,10 +1,11 @@
 // dflash_feature_ring.h — DFlash draft feature ring buffer (target-agnostic).
 //
-// Hosts the F32 ring buffer that mirrors target hidden-state captures on the
-// draft GPU, plus the helpers that move data:
+// Hosts the ring buffer that mirrors target hidden-state captures on the
+// draft GPU, plus the helpers that move data across the storage/input dtype
+// boundary:
 //   - target activation tensor → ring slot
 //   - ring range → contiguous draft input tensor
-//   - target BF16 feature cache tensor → ring (with BF16→F32 conversion,
+//   - target BF16 feature cache tensor → ring (with dtype conversion,
 //     possibly across devices)
 //
 // Lives in common/ so any DFlash target architecture (qwen35, gemma4,
@@ -18,20 +19,22 @@
 
 #include <cstddef>
 #include <cstdint>
+#include <vector>
 
 namespace dflash::common {
 
 struct DraftFeatureMirror {
     ggml_context * ctx = nullptr;
     ggml_backend_buffer_t buf = nullptr;
-    ggml_tensor * target_feat = nullptr; // F32 [n_target_layers*hidden_size, cap]
-    void * bf16_staging = nullptr;
-    size_t bf16_staging_elems = 0;
+    ggml_tensor * target_feat = nullptr; // [n_target_layers*hidden_size, cap]
+    void * staging = nullptr;
+    size_t staging_bytes = 0;
     int device = 0;
     int target_device = 0;
     int cap = 0;
     int n_target_layers = 0;
     int hidden_size = 0;
+    ggml_type storage_type = GGML_TYPE_F32;
 };
 
 void draft_feature_mirror_free(DraftFeatureMirror & mirror);
@@ -88,4 +91,16 @@ bool copy_feature_ring_range_to_tensor(
     int start_pos,
     int n_tokens);
 
+bool copy_feature_ring_range_to_host_f32(
+    const DraftFeatureMirror & feature_ring,
+    int start_pos,
+    int n_tokens,
+    std::vector<float> & out);
+
+bool copy_host_f32_to_feature_ring_range(
+    DraftFeatureMirror & feature_ring,
+    int start_pos,
+    int n_tokens,
+    const std::vector<float> & src);
+
 }  // namespace dflash::common
diff --git a/server/src/qwen35/qwen35_layer_split_adapter.cpp b/server/src/qwen35/qwen35_layer_split_adapter.cpp
index a9ece2c37..5ca32de50 100644
--- a/server/src/qwen35/qwen35_layer_split_adapter.cpp
+++ b/server/src/qwen35/qwen35_layer_split_adapter.cpp
@@ -303,18 +303,8 @@ bool Qwen35LayerSplitAdapter::snapshot_draft_features(int slot) {
         return remote_draft_.get_feature_range(start_pos, n_tokens, snap.data);
     }
 
-    if (!feature_ring_.target_feat) return false;
-    const int fc_in = n_layers * hidden;
-    const size_t row_bytes = (size_t)fc_in * sizeof(float);
-    const size_t src_stride = feature_ring_.target_feat->nb[1];
-    for (int i = 0; i < n_tokens; ++i) {
-        const int ring_slot = (start_pos + i) % ring_cap;
-        ggml_backend_tensor_get(feature_ring_.target_feat,
-                                snap.data.data() + (size_t)i * (size_t)fc_in,
-                                (size_t)ring_slot * src_stride,
-                                row_bytes);
-    }
-    return true;
+    return copy_feature_ring_range_to_host_f32(
+        feature_ring_, start_pos, n_tokens, snap.data);
 }
 
 void Qwen35LayerSplitAdapter::free_draft_feature_snapshot(int slot) {
@@ -354,17 +344,8 @@ bool Qwen35LayerSplitAdapter::restore_draft_features(int slot) {
         snap.hidden_size != feature_ring_.hidden_size) {
         return false;
     }
-    const int fc_in = snap.n_target_layers * snap.hidden_size;
-    const size_t row_bytes = (size_t)fc_in * sizeof(float);
-    const size_t dst_stride = feature_ring_.target_feat->nb[1];
-    for (int i = 0; i < snap.n_tokens; ++i) {
-        const int ring_slot = (snap.start_pos + i) % snap.cap;
-        ggml_backend_tensor_set(feature_ring_.target_feat,
-                                snap.data.data() + (size_t)i * (size_t)fc_in,
-                                (size_t)ring_slot * dst_stride,
-                                row_bytes);
-    }
-    return true;
+    return copy_host_f32_to_feature_ring_range(
+        feature_ring_, snap.start_pos, snap.n_tokens, snap.data);
 }
 
 int Qwen35LayerSplitAdapter::current_last_token() const {