feat(turbo-kv): add turbo_decode_k/v — batch dequantize for SDPA attention

solderzzc · solderzzc · commit e141627ff11a · 2026-03-30T19:51:11.000-07:00
Implements decode path: packed uint8 compressed KV history → float32
for concatenation with hot window before passing to standard SDPA.
Supports D=128 (68B/50B records) and D=256 (136B/100B records).
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/include/mlx/c/fast.h b/LocalPackages/mlx-swift/Source/Cmlx/include/mlx/c/fast.h
@@ -216,6 +216,16 @@ int mlx_fast_turbo_encode(
     int k_bits,
     const mlx_stream s);
 
+// Decode a TurboKV packed K-cache array into float32 and store it in *res.
+// Returns 0 on success, or 1 after reporting the failure through mlx_error.
+int mlx_fast_turbo_decode_k(
+    mlx_array* res,
+    const mlx_array packed,
+    const mlx_stream s);
+
+// Decode a TurboKV packed V-cache array into float32 and store it in *res.
+// Returns 0 on success, or 1 after reporting the failure through mlx_error.
+int mlx_fast_turbo_decode_v(
+    mlx_array* res,
+    const mlx_array packed,
+    const mlx_stream s);
+
 int mlx_fast_prefault(mlx_array x);
 
 int mlx_fast_pread_into(
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.cpp b/LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.cpp
@@ -814,8 +814,44 @@ extern "C" int mlx_fast_turbo_encode(
     return 0;
 }
 
+// C entry point for mlx::core::fast::turbo_decode_k.
+// On success the decoded array is stored into *res and 0 is returned. If the
+// C++ call throws a std::exception, its message is handed to mlx_error and 1
+// is returned instead.
+extern "C" int mlx_fast_turbo_decode_k(
+    mlx_array* res,
+    const mlx_array packed,
+    const mlx_stream s) {
+    try {
+        mlx_array_set_(
+            *res,
+            mlx::core::fast::turbo_decode_k(
+                mlx_array_get_(packed), mlx_stream_get_(s)));
+        return 0;
+    } catch (std::exception& e) {
+        mlx_error(e.what());
+        return 1;
+    }
+}
+
+// C entry point for mlx::core::fast::turbo_decode_v.
+// On success the decoded array is stored into *res and 0 is returned. If the
+// C++ call throws a std::exception, its message is handed to mlx_error and 1
+// is returned instead.
+extern "C" int mlx_fast_turbo_decode_v(
+    mlx_array* res,
+    const mlx_array packed,
+    const mlx_stream s) {
+    try {
+        mlx_array_set_(
+            *res,
+            mlx::core::fast::turbo_decode_v(
+                mlx_array_get_(packed), mlx_stream_get_(s)));
+        return 0;
+    } catch (std::exception& e) {
+        mlx_error(e.what());
+        return 1;
+    }
+}
+
+
 extern "C" int mlx_fast_prefault(
     mlx_array x) {
+
     try {
         mlx::core::prefault(mlx_array_get_(x));
     } catch (std::exception& e) {
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.h b/LocalPackages/mlx-swift/Source/Cmlx/mlx-c/mlx/c/fast.h
@@ -215,8 +215,19 @@ int mlx_fast_turbo_encode(
     int k_bits,
     const mlx_stream s);
 
+// Decode a TurboKV packed K-cache array into float32 and store it in *res.
+// Returns 0 on success, or 1 after reporting the failure through mlx_error.
+int mlx_fast_turbo_decode_k(
+    mlx_array* res,
+    const mlx_array packed,
+    const mlx_stream s);
+
+// Decode a TurboKV packed V-cache array into float32 and store it in *res.
+// Returns 0 on success, or 1 after reporting the failure through mlx_error.
+int mlx_fast_turbo_decode_v(
+    mlx_array* res,
+    const mlx_array packed,
+    const mlx_stream s);
+
 int mlx_fast_prefault(mlx_array x);
 
+
 // pread() directly into the already-evaluated MLX array's unified memory buffer.
 // This gives full NVMe sequential throughput without OS page-fault overhead.
 // The array MUST already be evaluated (concrete pointer exists).
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/fast.cpp b/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/fast.cpp
@@ -1054,4 +1054,91 @@ array turbo_encode_v(const array& values, StreamOrDevice s_) {
   return array(buf.data(), out_shape, uint8);
 }
 
+
+// ── TurboQuant Decode ─────────────────────────────────────────────────────────
+// Batch-decode packed uint8 compressed history back to float32 tensors.
+// Used by KVCacheSimple when routing compressed history through standard SDPA.
+// Supports head_dim=128 (record=68B K / 50B V) and head_dim=256 (2 sub-groups).
+
+// Decode packed TurboKV K records into float32 with a host-side loop.
+//
+// packed:  last axis holds one 68-byte record (D=128) or two back-to-back
+//          records (136 bytes, D=256); it is cast to uint8 and evaluated here.
+// returns: float32 array with packed's shape, last axis replaced by head_dim.
+// throws:  std::invalid_argument for a 0-d input or any other record size.
+array turbo_decode_k(const array& packed, StreamOrDevice s_) {
+  auto s = to_stream(s_);
+
+  // A 0-d array has no last axis; treat it as record size 0 so it is rejected
+  // by the same check (and message) as any other unsupported size.
+  const int record_bytes =
+      packed.ndim() == 0 ? 0 : static_cast<int>(packed.shape(-1));
+  if (record_bytes != TURBO_K_RECORD && record_bytes != TURBO_K_RECORD * 2) {
+    throw std::invalid_argument(
+        "[turbo_decode_k] last dim must be 68 (D=128) or 136 (D=256), got " +
+        std::to_string(record_bytes));
+  }
+  const size_t record_stride = static_cast<size_t>(record_bytes);
+  const size_t n_subgroups = static_cast<size_t>(record_bytes / TURBO_K_RECORD);
+  const size_t head_dim = n_subgroups * ::mlx::core::fast::TURBO_D;
+
+  // The loop below walks the raw buffer linearly, which is only correct for a
+  // densely packed row-major layout. Force a row-contiguous array first so a
+  // strided view (e.g. a slice or transpose) is not read with wrong offsets.
+  auto packed_u8 = contiguous(astype(packed, uint8, s), false, s);
+  eval(packed_u8);
+  const uint8_t* src = packed_u8.data<uint8_t>();
+
+  // Counts and offsets are size_t so large histories cannot overflow int.
+  const size_t n_records = packed_u8.size() / record_stride;
+  std::vector<float> buf(n_records * head_dim);
+
+  for (size_t i = 0; i < n_records; ++i) {
+    for (size_t g = 0; g < n_subgroups; ++g) {
+      const uint8_t* sub_src = src + i * record_stride + g * TURBO_K_RECORD;
+      ::mlx::core::fast::TurboQuantK rec;
+      std::memset(&rec, 0, sizeof(rec));
+      // Wire layout: 48 B indices | 16 B qjl_signs | 2 B norm | 2 B rnorm.
+      std::memcpy(rec.indices,    sub_src,      48);
+      std::memcpy(rec.qjl_signs,  sub_src + 48, 16);
+      std::memcpy(&rec.norm_fp16,  sub_src + 64,  2);
+      std::memcpy(&rec.rnorm_fp16, sub_src + 66,  2);
+      ::mlx::core::fast::turbo_dequantize_k(
+          rec,
+          buf.data() + i * head_dim + g * ::mlx::core::fast::TURBO_D,
+          ::mlx::core::fast::TURBO_D);
+    }
+  }
+
+  Shape out_shape = packed.shape();
+  out_shape.back() = static_cast<int>(head_dim);
+  // Return float32; Swift caller casts to model dtype (fp16/bf16) as needed
+  return array(buf.data(), out_shape, float32);
+}
+
+// Decode packed TurboKV V records into float32 with a host-side loop.
+//
+// packed:  last axis holds one 50-byte record (D=128) or two back-to-back
+//          records (100 bytes, D=256); it is cast to uint8 and evaluated here.
+// returns: float32 array with packed's shape, last axis replaced by head_dim.
+// throws:  std::invalid_argument for a 0-d input or any other record size.
+array turbo_decode_v(const array& packed, StreamOrDevice s_) {
+  auto s = to_stream(s_);
+
+  // A 0-d array has no last axis; treat it as record size 0 so it is rejected
+  // by the same check (and message) as any other unsupported size.
+  const int record_bytes =
+      packed.ndim() == 0 ? 0 : static_cast<int>(packed.shape(-1));
+  if (record_bytes != TURBO_V_RECORD && record_bytes != TURBO_V_RECORD * 2) {
+    throw std::invalid_argument(
+        "[turbo_decode_v] last dim must be 50 (D=128) or 100 (D=256), got " +
+        std::to_string(record_bytes));
+  }
+  const size_t record_stride = static_cast<size_t>(record_bytes);
+  const size_t n_subgroups = static_cast<size_t>(record_bytes / TURBO_V_RECORD);
+  const size_t head_dim = n_subgroups * ::mlx::core::fast::TURBO_D;
+
+  // The loop below walks the raw buffer linearly, which is only correct for a
+  // densely packed row-major layout. Force a row-contiguous array first so a
+  // strided view (e.g. a slice or transpose) is not read with wrong offsets.
+  auto packed_u8 = contiguous(astype(packed, uint8, s), false, s);
+  eval(packed_u8);
+  const uint8_t* src = packed_u8.data<uint8_t>();
+
+  // Counts and offsets are size_t so large histories cannot overflow int.
+  const size_t n_records = packed_u8.size() / record_stride;
+  std::vector<float> buf(n_records * head_dim);
+
+  for (size_t i = 0; i < n_records; ++i) {
+    for (size_t g = 0; g < n_subgroups; ++g) {
+      const uint8_t* sub_src = src + i * record_stride + g * TURBO_V_RECORD;
+      ::mlx::core::fast::TurboQuantV rec;
+      std::memset(&rec, 0, sizeof(rec));
+      // Wire layout: 48 B indices | 2 B norm.
+      std::memcpy(rec.indices,   sub_src,      48);
+      std::memcpy(&rec.norm_fp16, sub_src + 48,  2);
+      ::mlx::core::fast::turbo_dequantize_v(
+          rec,
+          buf.data() + i * head_dim + g * ::mlx::core::fast::TURBO_D,
+          ::mlx::core::fast::TURBO_D);
+    }
+  }
+
+  Shape out_shape = packed.shape();
+  out_shape.back() = static_cast<int>(head_dim);
+  return array(buf.data(), out_shape, float32);
+}
+
 } // namespace mlx::core::fast
+
diff --git a/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/fast.h b/LocalPackages/mlx-swift/Source/Cmlx/mlx/mlx/fast.h
@@ -118,4 +118,20 @@ MLX_API array turbo_encode_k(const array& keys, StreamOrDevice s = {});
  */
 MLX_API array turbo_encode_v(const array& values, StreamOrDevice s = {});
 
+/**
+ * Decode TurboKV compressed K-cache back to float32.
+ *
+ * The input is evaluated eagerly and decoded record-by-record on the host,
+ * so the result is not a lazily computed graph node.
+ *
+ * packed: uint8 with last dim 68 (D=128) or 136 (D=256); other dtypes are
+ *         cast to uint8 before decoding
+ * returns: float32 array shaped like `packed`, with last dim = head_dim
+ *          (128 or 256)
+ * throws: std::invalid_argument if the last dim is neither 68 nor 136
+ */
+MLX_API array turbo_decode_k(const array& packed, StreamOrDevice s = {});
+
+/**
+ * Decode TurboKV compressed V-cache back to float32.
+ *
+ * The input is evaluated eagerly and decoded record-by-record on the host,
+ * so the result is not a lazily computed graph node.
+ *
+ * packed: uint8 with last dim 50 (D=128) or 100 (D=256); other dtypes are
+ *         cast to uint8 before decoding
+ * returns: float32 array shaped like `packed`, with last dim = head_dim
+ *          (128 or 256)
+ * throws: std::invalid_argument if the last dim is neither 50 nor 100
+ */
+MLX_API array turbo_decode_v(const array& packed, StreamOrDevice s = {});
+
 } // namespace mlx::core::fast
diff --git a/LocalPackages/mlx-swift/Source/MLX/MLXFast.swift b/LocalPackages/mlx-swift/Source/MLX/MLXFast.swift
@@ -278,6 +278,30 @@ public enum MLXFast {
         return (kTuple, vTuple)
     }
 
+    /// Expands TurboKV-compressed key history (packed uint8) into a float32 array.
+    ///
+    /// Forwards to the C entry point `mlx_fast_turbo_decode_k`; the status code it
+    /// returns is not inspected here.
+    ///
+    /// - Parameters:
+    ///   - packed: `[..., 68]` uint8 for D=128, or `[..., 136]` for D=256
+    ///   - stream: stream or device handed through to the decode call
+    /// - Returns: `[..., headDim]` float32 — caller casts to model dtype as needed
+    public static func turboDecodeK(
+        packed: MLXArray, stream: StreamOrDevice = .default
+    ) -> MLXArray {
+        var decoded = mlx_array_new()
+        mlx_fast_turbo_decode_k(&decoded, packed.ctx, stream.ctx)
+        return MLXArray(decoded)
+    }
+
+    /// Expands TurboKV-compressed value history (packed uint8) into a float32 array.
+    ///
+    /// Forwards to the C entry point `mlx_fast_turbo_decode_v`; the status code it
+    /// returns is not inspected here.
+    ///
+    /// - Parameters:
+    ///   - packed: `[..., 50]` uint8 for D=128, or `[..., 100]` for D=256
+    ///   - stream: stream or device handed through to the decode call
+    /// - Returns: `[..., headDim]` float32 — caller casts to model dtype as needed
+    public static func turboDecodeV(
+        packed: MLXArray, stream: StreamOrDevice = .default
+    ) -> MLXArray {
+        var decoded = mlx_array_new()
+        mlx_fast_turbo_decode_v(&decoded, packed.ctx, stream.ctx)
+        return MLXArray(decoded)
+    }
+
     // ── SSD Flash-Stream Metrics ──────────────────────────────────────────────
 
     /// Snapshot of cumulative SSD streaming throughput stats.