diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt
index d7c0bb681..2bdbb2199 100644
--- a/dflash/CMakeLists.txt
+++ b/dflash/CMakeLists.txt
@@ -4,9 +4,9 @@ set_property(CACHE DFLASH27B_GPU_BACKEND PROPERTY STRINGS cuda hip)
 string(TOLOWER "${DFLASH27B_GPU_BACKEND}" DFLASH27B_GPU_BACKEND)
 if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
     set(DFLASH27B_USER_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")
-    project(dflash27b LANGUAGES C CXX CUDA)
+    project(dflash LANGUAGES C CXX CUDA)
 elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip")
-    project(dflash27b LANGUAGES C CXX HIP)
+    project(dflash LANGUAGES C CXX HIP)
 else()
     message(FATAL_ERROR "DFLASH27B_GPU_BACKEND must be 'cuda' or 'hip', got '${DFLASH27B_GPU_BACKEND}'")
 endif()
@@ -31,21 +31,21 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
 # ROCm root for HIP builds (rpath + rocwmma header discovery).
 if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
     if(DEFINED ROCM_PATH)
-        set(_dflash27b_rocm_root "${ROCM_PATH}")
+        set(_dflash_rocm_root "${ROCM_PATH}")
     elseif(DEFINED ENV{ROCM_PATH})
-        set(_dflash27b_rocm_root "$ENV{ROCM_PATH}")
+        set(_dflash_rocm_root "$ENV{ROCM_PATH}")
     elseif(EXISTS "/opt/rocm")
-        set(_dflash27b_rocm_root "/opt/rocm")
+        set(_dflash_rocm_root "/opt/rocm")
     else()
-        set(_dflash27b_rocm_root "")
+        set(_dflash_rocm_root "")
     endif()
 endif()
 # Bake portable rpath into all executables so bundled ggml backend libs / libggml-base
 # are found regardless of LD_LIBRARY_PATH or stale /usr/local/lib (closes #31).
 set(CMAKE_INSTALL_RPATH "$ORIGIN/deps/llama.cpp/ggml/src;$ORIGIN/deps/llama.cpp/ggml/src/ggml-cuda;$ORIGIN/deps/llama.cpp/ggml/src/ggml-hip;$ORIGIN/../deps/llama.cpp/ggml/src;$ORIGIN/../deps/llama.cpp/ggml/src/ggml-cuda;$ORIGIN/../deps/llama.cpp/ggml/src/ggml-hip")
-if(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND _dflash27b_rocm_root)
-    list(APPEND CMAKE_BUILD_RPATH "${_dflash27b_rocm_root}/lib" "${_dflash27b_rocm_root}/lib64")
-    set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${_dflash27b_rocm_root}/lib;${_dflash27b_rocm_root}/lib64")
+if(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND _dflash_rocm_root)
+    list(APPEND CMAKE_BUILD_RPATH "${_dflash_rocm_root}/lib" "${_dflash_rocm_root}/lib64")
+    set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${_dflash_rocm_root}/lib;${_dflash_rocm_root}/lib64")
 endif()
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
 
@@ -114,33 +114,33 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
     # (110 on CUDA 13+) added when nvcc supports them. DGX Spark /
     # GB10 is compute capability 12.1 (121), added at CUDA 12.9+.
     if(DFLASH27B_USER_CUDA_ARCHITECTURES)
-        set(_dflash27b_archs "${DFLASH27B_USER_CUDA_ARCHITECTURES}")
+        set(_dflash_archs "${DFLASH27B_USER_CUDA_ARCHITECTURES}")
     else()
-        set(_dflash27b_archs "60;61;62;70;75;86")
+        set(_dflash_archs "60;61;62;70;75;86")
         if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8")
-            list(APPEND _dflash27b_archs "120")
+            list(APPEND _dflash_archs "120")
         endif()
         if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
-            list(APPEND _dflash27b_archs "110")
+            list(APPEND _dflash_archs "110")
         endif()
         if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9")
-            list(APPEND _dflash27b_archs "121")
+            list(APPEND _dflash_archs "121")
         endif()
     endif()
 elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip")
     # User override precedence: -DDFLASH27B_HIP_ARCHITECTURES → -DAMDGPU_TARGETS
     # → gfx1151 default (Strix Halo).
     if(DFLASH27B_HIP_ARCHITECTURES)
-        set(_dflash27b_archs "${DFLASH27B_HIP_ARCHITECTURES}")
+        set(_dflash_archs "${DFLASH27B_HIP_ARCHITECTURES}")
     elseif(AMDGPU_TARGETS)
-        set(_dflash27b_archs "${AMDGPU_TARGETS}")
+        set(_dflash_archs "${AMDGPU_TARGETS}")
     else()
-        set(_dflash27b_archs "gfx1151")
+        set(_dflash_archs "gfx1151")
     endif()
     # Make sure the HIP language picks up the resolved arch list.
-    set(CMAKE_HIP_ARCHITECTURES "${_dflash27b_archs}" CACHE STRING "" FORCE)
+    set(CMAKE_HIP_ARCHITECTURES "${_dflash_archs}" CACHE STRING "" FORCE)
 else()
-    set(_dflash27b_archs "")
+    set(_dflash_archs "")
 endif()
 
 # Consumer Blackwell workaround: skip sm_12x→sm_12xa replacement and FP4
@@ -156,9 +156,9 @@ endif()
 
 if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND NOT DEFINED _dflash_is_consumer_blackwell)
     set(_dflash_is_consumer_blackwell OFF)
-    # Iterate the resolved dflash27b arch list, not raw CMAKE_CUDA_ARCHITECTURES,
+    # Iterate the resolved _dflash_archs list, not raw CMAKE_CUDA_ARCHITECTURES,
     # which is empty on the default path (the project supplies its own list above).
-    foreach(_arch IN LISTS _dflash27b_archs)
+    foreach(_arch IN LISTS _dflash_archs)
         string(REGEX REPLACE "[^0-9]" "" _dflash_arch_num "${_arch}")
         if(_dflash_arch_num MATCHES "^12[0-9]$")
             set(_dflash_is_consumer_blackwell ON)
@@ -196,7 +196,7 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip")
     find_package(hip REQUIRED)
 endif()
 
-# ─── dflash27b static library ──────────────────────────────────────
+# ─── dflash_common static library ──────────────────────────────────────
 
 set(DFLASH27B_SRC_INCLUDE_DIRS
     ${CMAKE_CURRENT_SOURCE_DIR}/src
@@ -209,7 +209,7 @@ set(DFLASH27B_SRC_INCLUDE_DIRS
     ${CMAKE_CURRENT_SOURCE_DIR}/src/server
 )
 
-add_library(dflash27b STATIC
+add_library(dflash_common STATIC
     src/errors.cpp
     src/qwen35/gguf_target_loader.cpp
     src/qwen35/qwen35_target_graph.cpp
@@ -283,21 +283,21 @@ endif()
 # Apply the arch list to local sources. HIP builds pass CMAKE_HIP_ARCHITECTURES
 # through to the HIP toolchain/ggml backend and optionally build rocWMMA kernels.
 if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
-    target_sources(dflash27b PRIVATE src/cuda_cross_device_copy.cpp)
-    set_target_properties(dflash27b PROPERTIES CUDA_ARCHITECTURES "${_dflash27b_archs}")
-    list(GET _dflash27b_archs 0 _dflash27b_cuda_min_sm)
+    target_sources(dflash_common PRIVATE src/cuda_cross_device_copy.cpp)
+    set_target_properties(dflash_common PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}")
+    list(GET _dflash_archs 0 _dflash_cuda_min_sm)
     # Strip any trailing 'a' suffix (e.g. "121a" -> "121")
-    string(REGEX REPLACE "[^0-9]" "" _dflash27b_cuda_min_sm "${_dflash27b_cuda_min_sm}")
-    target_compile_definitions(dflash27b PRIVATE
+    string(REGEX REPLACE "[^0-9]" "" _dflash_cuda_min_sm "${_dflash_cuda_min_sm}")
+    target_compile_definitions(dflash_common PRIVATE
         DFLASH27B_BACKEND_CUDA=1
-        DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm}
-        DFLASH27B_MIN_SM=${_dflash27b_cuda_min_sm})
+        DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}
+        DFLASH27B_MIN_SM=${_dflash_cuda_min_sm})
 elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip")
-    set_target_properties(dflash27b PROPERTIES HIP_ARCHITECTURES "${_dflash27b_archs}")
-    target_compile_definitions(dflash27b PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
-    # hip_compat shim is needed by ALL dflash27b sources (peer_access.cpp,
+    set_target_properties(dflash_common PROPERTIES HIP_ARCHITECTURES "${_dflash_archs}")
+    target_compile_definitions(dflash_common PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
+    # hip_compat shim is needed by ALL dflash_common sources (peer_access.cpp,
     # dflash_feature_ring.cpp, flashprefill.cpp), not just the SM80_EQUIV path.
-    target_include_directories(dflash27b PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat)
+    target_include_directories(dflash_common PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat)
 endif()
 
 # FlashPrefill custom kernels.
@@ -310,11 +310,11 @@ endif()
 #   BF16 buffers → bf16 WMMA kernel; F16 buffers → f16 WMMA kernel; else → ggml FA.
 if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
     # rms_norm_hip.cu is needed by the HIP chunk-B graph path regardless of SM80_EQUIV.
-    target_sources(dflash27b PRIVATE src/rms_norm_hip.cu)
+    target_sources(dflash_common PRIVATE src/rms_norm_hip.cu)
     set_source_files_properties(src/rms_norm_hip.cu PROPERTIES LANGUAGE HIP)
     if(DFLASH27B_HIP_SM80_EQUIV)
         find_path(DFLASH27B_ROCWMMA_INCLUDE_DIR rocwmma/rocwmma.hpp
-            HINTS "${_dflash27b_rocm_root}/include" /opt/rocm/include
+            HINTS "${_dflash_rocm_root}/include" /opt/rocm/include
             NO_DEFAULT_PATH)
         if(NOT DFLASH27B_ROCWMMA_INCLUDE_DIR)
             message(FATAL_ERROR
@@ -323,16 +323,16 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
                 "or fetch headers from https://github.com/ROCm/rocWMMA), or rebuild "
                 "with -DDFLASH27B_HIP_SM80_EQUIV=OFF (uses the slower q8 fallback).")
         endif()
-        target_sources(dflash27b PRIVATE
+        target_sources(dflash_common PRIVATE
             src/flashprefill_kernels.hip.cu
             src/flashprefill_select.cpp
             src/flashprefill.cpp)
         set_source_files_properties(src/flashprefill_kernels.hip.cu
             PROPERTIES LANGUAGE HIP)
-        target_include_directories(dflash27b PRIVATE
+        target_include_directories(dflash_common PRIVATE
             ${DFLASH27B_ROCWMMA_INCLUDE_DIR}
             ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat)
-        target_compile_definitions(dflash27b PRIVATE
+        target_compile_definitions(dflash_common PRIVATE
             DFLASH27B_HAVE_FLASHPREFILL=1
             DFLASH27B_CUDA_MIN_SM=80)
         message(STATUS "dflash: HIP Phase 2 - rocWMMA flashprefill kernels enabled")
@@ -340,7 +340,7 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
         message(STATUS "dflash: HIP Phase 1 - ggml q8 fallback for flashprefill")
     endif()
 elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
-    target_sources(dflash27b PRIVATE
+    target_sources(dflash_common PRIVATE
         src/flashprefill_select.cpp
         src/flashprefill.cpp)
     # Multi-arch: scan all resolved arches and compile every applicable
@@ -349,7 +349,7 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
     set(_dflash_has_sm80 OFF)
     set(_dflash_has_sm70 OFF)
     set(_dflash_has_sm60 OFF)
-    foreach(_arch IN LISTS _dflash27b_archs)
+    foreach(_arch IN LISTS _dflash_archs)
         string(REGEX REPLACE "[^0-9]" "" _arch_num "${_arch}")
         if(_arch_num GREATER_EQUAL 80)
             set(_dflash_has_sm80 ON)
@@ -363,19 +363,19 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
     endforeach()
     # Also honour the single-arch path (min_sm) so non-multi builds keep
     # their existing behaviour when only one tier matches.
-    if(_dflash27b_cuda_min_sm GREATER_EQUAL 80)
+    if(_dflash_cuda_min_sm GREATER_EQUAL 80)
         set(_dflash_has_sm80 ON)
-    elseif(_dflash27b_cuda_min_sm GREATER_EQUAL 70)
+    elseif(_dflash_cuda_min_sm GREATER_EQUAL 70)
         set(_dflash_has_sm70 ON)
-    elseif(_dflash27b_cuda_min_sm GREATER_EQUAL 60)
+    elseif(_dflash_cuda_min_sm GREATER_EQUAL 60)
         set(_dflash_has_sm60 ON)
     endif()
 
     if(_dflash_has_sm80)
-        target_sources(dflash27b PRIVATE
+        target_sources(dflash_common PRIVATE
             src/flashprefill_kernels.cu
             src/pflash_ggml_adapter.cpp)
-        target_compile_definitions(dflash27b PRIVATE
+        target_compile_definitions(dflash_common PRIVATE
             DFLASH27B_HAVE_CUDA_WMMA_FLASHPREFILL=1
             DFLASH27B_HAVE_SM80_FLASHPREFILL=1)
         # BF16 WMMA kernels require sm_80+.  Restrict compilation to those
@@ -384,8 +384,8 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
             CUDA_ARCHITECTURES "80;86;90;100;110;120;121")
     endif()
     if(_dflash_has_sm70)
-        target_sources(dflash27b PRIVATE src/flashprefill_f16.cu)
-        target_compile_definitions(dflash27b PRIVATE
+        target_sources(dflash_common PRIVATE src/flashprefill_f16.cu)
+        target_compile_definitions(dflash_common PRIVATE
             DFLASH27B_HAVE_CUDA_WMMA_FLASHPREFILL=1
             DFLASH27B_HAVE_VOLTA_FLASHPREFILL=1)
         # F16 WMMA kernels require sm_70+.  Restrict compilation to Volta/Turing
@@ -394,8 +394,8 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
             CUDA_ARCHITECTURES "70;75")
     endif()
     if(_dflash_has_sm60)
-        target_sources(dflash27b PRIVATE src/flashprefill_scalar.cu)
-        target_compile_definitions(dflash27b PRIVATE
+        target_sources(dflash_common PRIVATE src/flashprefill_scalar.cu)
+        target_compile_definitions(dflash_common PRIVATE
             DFLASH27B_HAVE_CUDA_SCALAR_FLASHPREFILL=1
             DFLASH27B_HAVE_PASCAL_FLASHPREFILL=1)
         # Scalar Pascal kernels only target sm_60-69.
@@ -414,7 +414,7 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND DFLASH27B_ENABLE_BSA AND NOT DFLASH2
     set(DFLASH27B_ENABLE_BSA OFF)
 endif()
 if(DFLASH27B_ENABLE_BSA)
-    foreach(_arch IN LISTS _dflash27b_archs)
+    foreach(_arch IN LISTS _dflash_archs)
         if(_arch LESS 80)
             message(WARNING
                 "DFLASH27B_ENABLE_BSA=ON requested but CUDA_ARCHITECTURES contains '${_arch}' (<80); "
@@ -439,14 +439,14 @@ endif()
 if(DFLASH27B_ENABLE_BSA)
     if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
         # HIP BSA: backed by our rocWMMA sparse-FA kernel; no CUTLASS needed.
-        target_sources(dflash27b PRIVATE src/bsa_launcher_hip.cu)
+        target_sources(dflash_common PRIVATE src/bsa_launcher_hip.cu)
         set_source_files_properties(src/bsa_launcher_hip.cu PROPERTIES LANGUAGE HIP)
     else()
-        target_sources(dflash27b PRIVATE src/bsa_fwd_inst.cu src/bsa_launcher.cu)
+        target_sources(dflash_common PRIVATE src/bsa_fwd_inst.cu src/bsa_launcher.cu)
     endif()
 endif()
 
-target_include_directories(dflash27b
+target_include_directories(dflash_common
     PUBLIC
         ${CMAKE_CURRENT_SOURCE_DIR}/include
     PRIVATE
@@ -454,30 +454,30 @@ target_include_directories(dflash27b
         ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src
 )
 if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
-    target_include_directories(dflash27b PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
+    target_include_directories(dflash_common PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
 endif()
 if(DFLASH27B_ENABLE_BSA)
     if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
         # HIP path: bsa_launcher_hip.cu only needs DFLASH27B_HAVE_BSA + the
         # hip_compat shim (which the Phase 2 block above already adds).
-        target_compile_definitions(dflash27b PRIVATE DFLASH27B_HAVE_BSA=1)
+        target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_BSA=1)
     else()
-        target_include_directories(dflash27b PRIVATE
+        target_include_directories(dflash_common PRIVATE
             ${CMAKE_CURRENT_SOURCE_DIR}/deps/bsa_stubs
             ${CMAKE_CURRENT_SOURCE_DIR}/deps/Block-Sparse-Attention/csrc/cutlass/include
             ${CMAKE_CURRENT_SOURCE_DIR}/deps/Block-Sparse-Attention/csrc/block_sparse_attn/src)
-        target_compile_options(dflash27b PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
-        target_compile_definitions(dflash27b PRIVATE FLASHATTENTION_DISABLE_DROPOUT FLASH_NAMESPACE=flash DFLASH27B_HAVE_BSA=1)
+        target_compile_options(dflash_common PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
+        target_compile_definitions(dflash_common PRIVATE FLASHATTENTION_DISABLE_DROPOUT FLASH_NAMESPACE=flash DFLASH27B_HAVE_BSA=1)
         # MSVC's <cmath> hides POSIX M_* macros (M_LOG2E etc.) unless _USE_MATH_DEFINES
         # is set before any cmath include. BSA's softmax.h relies on M_LOG2E; define
         # globally on the target so it precedes every TU's first <cmath> include.
         if(WIN32)
-            target_compile_definitions(dflash27b PRIVATE _USE_MATH_DEFINES)
+            target_compile_definitions(dflash_common PRIVATE _USE_MATH_DEFINES)
         endif()
     endif()
 endif()
 
-target_link_libraries(dflash27b
+target_link_libraries(dflash_common
     PUBLIC
         ggml
         ${DFLASH27B_GGML_BACKEND_TARGET}
@@ -485,15 +485,15 @@ target_link_libraries(dflash27b
         nlohmann_json::nlohmann_json
 )
 if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
-    target_link_libraries(dflash27b PRIVATE hip::host)
+    target_link_libraries(dflash_common PRIVATE hip::host)
 endif()
 
 if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-     target_compile_options(dflash27b PRIVATE
+     target_compile_options(dflash_common PRIVATE
          $<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wno-unused-parameter -Wno-unused-function>
      )
 elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-     target_compile_options(dflash27b PRIVATE
+     target_compile_options(dflash_common PRIVATE
          $<$<COMPILE_LANGUAGE:CXX>:/W4 /permissive->
      )
 endif()
@@ -503,32 +503,32 @@ endif()
 if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/pflash_daemon.cpp")
     add_executable(pflash_daemon test/pflash_daemon.cpp)
     target_include_directories(pflash_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-    target_link_libraries(pflash_daemon PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+    target_link_libraries(pflash_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
 endif()
 
 # ─── Tests (numerics vs oracle) ────────────────────────────────────
 
 option(DFLASH27B_TESTS "Build numerics tests" ON)
 if(DFLASH27B_TESTS)
-    if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND _dflash27b_cuda_min_sm GREATER_EQUAL 80 AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp")
+    if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND _dflash_cuda_min_sm GREATER_EQUAL 80 AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp")
         add_executable(test_flashprefill_kernels test/test_flashprefill_kernels.cpp)
-        set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash27b_archs}")
+        set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}")
         target_include_directories(test_flashprefill_kernels PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-        target_link_libraries(test_flashprefill_kernels PRIVATE dflash27b CUDA::cudart)
+        target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common CUDA::cudart)
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kv_quant.cpp")
         add_executable(test_kv_quant test/test_kv_quant.cpp)
         target_include_directories(test_kv_quant PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(test_kv_quant PRIVATE dflash27b)
+        target_link_libraries(test_kv_quant PRIVATE dflash_common)
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp")
         add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp)
-        target_link_libraries(test_draft_vs_reference PRIVATE dflash27b)
+        target_link_libraries(test_draft_vs_reference PRIVATE dflash_common)
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_draft.cpp")
         add_executable(smoke_load_draft test/smoke_load_draft.cpp)
         target_include_directories(smoke_load_draft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_load_draft PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(smoke_load_draft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/spike_thin_copy.cpp")
         add_executable(spike_thin_copy test/spike_thin_copy.cpp)
@@ -538,62 +538,62 @@ if(DFLASH27B_TESTS)
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_draft_graph.cpp")
         add_executable(smoke_draft_graph test/smoke_draft_graph.cpp)
         target_include_directories(smoke_draft_graph PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_draft_graph PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(smoke_draft_graph PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_qwen3_forward.cpp")
         add_executable(smoke_qwen3_forward test/smoke_qwen3_forward.cpp)
         target_include_directories(smoke_qwen3_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_qwen3_forward PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(smoke_qwen3_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_vs_oracle.cpp")
         add_executable(test_vs_oracle test/test_vs_oracle.cpp)
         target_include_directories(test_vs_oracle PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(test_vs_oracle PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(test_vs_oracle PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target.cpp")
         add_executable(smoke_load_target test/smoke_load_target.cpp)
         target_include_directories(smoke_load_target PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_load_target PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(smoke_load_target PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp")
         add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp)
         target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_load_target_laguna PRIVATE dflash27b ggml ggml-cuda)
+        target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp")
         add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp)
         target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_laguna_forward PRIVATE dflash27b ggml ggml-cuda)
+        target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp")
         add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp)
         target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_ttft PRIVATE dflash27b ggml ggml-cuda)
+        target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp")
         add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp)
         target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_pflash PRIVATE dflash27b ggml ggml-cuda)
+        target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp")
         add_executable(bench_laguna_generate test/bench_laguna_generate.cpp)
         target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(bench_laguna_generate PRIVATE dflash27b ggml ggml-cuda)
+        target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp")
         add_executable(test_laguna_daemon test/test_laguna_daemon.cpp)
         target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(test_laguna_daemon PRIVATE dflash27b ggml ggml-cuda)
+        target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp")
         add_executable(smoke_target_forward test/smoke_target_forward.cpp)
         target_include_directories(smoke_target_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(smoke_target_forward PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(smoke_target_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_generate.cpp")
         add_executable(test_generate test/test_generate.cpp)
         target_include_directories(test_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
-        target_link_libraries(test_generate PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(test_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp")
         add_executable(test_restore_delta test/test_restore_delta.cpp)
@@ -607,9 +607,9 @@ if(DFLASH27B_TESTS)
         else()
             target_compile_definitions(test_dflash PRIVATE
                 DFLASH27B_BACKEND_CUDA=1
-                DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm})
+                DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
         endif()
-        target_link_libraries(test_dflash PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(test_dflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
         # test_dflash uses cuda*/hip* runtime calls directly for fast rollback,
         # peer access, and target-layer split copies. Link the selected runtime.
         if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
@@ -637,9 +637,9 @@ if(DFLASH27B_TESTS)
         else()
             target_compile_definitions(dflash_server PRIVATE
                 DFLASH27B_BACKEND_CUDA=1
-                DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm})
+                DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
         endif()
-        target_link_libraries(dflash_server PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread)
+        target_link_libraries(dflash_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread)
         if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
             find_package(CUDAToolkit REQUIRED)
             target_link_libraries(dflash_server PRIVATE CUDA::cudart)
@@ -657,9 +657,9 @@ if(DFLASH27B_TESTS)
         else()
             target_compile_definitions(test_tokenizer_harness PRIVATE
                 DFLASH27B_BACKEND_CUDA=1
-                DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm})
+                DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
         endif()
-        target_link_libraries(test_tokenizer_harness PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(test_tokenizer_harness PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
         if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
             find_package(CUDAToolkit REQUIRED)
             target_link_libraries(test_tokenizer_harness PRIVATE CUDA::cudart)
@@ -679,9 +679,9 @@ if(DFLASH27B_TESTS)
         else()
             target_compile_definitions(test_server_unit PRIVATE
                 DFLASH27B_BACKEND_CUDA=1
-                DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm})
+                DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
         endif()
-        target_link_libraries(test_server_unit PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET})
+        target_link_libraries(test_server_unit PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
         if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
             find_package(CUDAToolkit REQUIRED)
             target_link_libraries(test_server_unit PRIVATE CUDA::cudart)
@@ -723,12 +723,12 @@ if(DFLASH27B_TESTS)
     endforeach()
 
     # Gated on the same condition as src/pflash_ggml_adapter.cpp above:
-    # the adapter is only compiled into dflash27b when the CUDA arch list
+    # the adapter is only compiled into dflash_common when the CUDA arch list
     # includes an sm_80+ target, so this test can only link there too.
     if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND _dflash_has_sm80
        AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flash_attn_sparse.cpp")
         add_executable(test_flash_attn_sparse test/test_flash_attn_sparse.cpp)
-        target_link_libraries(test_flash_attn_sparse PRIVATE dflash27b ggml ggml-cuda ggml-base)
+        target_link_libraries(test_flash_attn_sparse PRIVATE dflash_common ggml ggml-cuda ggml-base)
         target_include_directories(test_flash_attn_sparse PRIVATE
             ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/include
             ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src
diff --git a/dflash/README.md b/dflash/README.md
index e22ee6245..35bc2a2b7 100644
--- a/dflash/README.md
+++ b/dflash/README.md
@@ -210,7 +210,7 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX
 and dispatches by arch:
 
   - `qwen35` / `qwen36` → existing DFlash + DDTree pipeline (no change).
-  - `laguna` → `dflash27b::run_laguna_daemon()` (no spec-decode, no DDTree).
+  - `laguna` → `dflash::common::run_laguna_daemon()` (no spec-decode, no DDTree).
 
 The daemon stdin/stream-fd protocol is identical, so `scripts/server.py`
 drives both arches end-to-end. The only thing the user changes is `--target`.
diff --git a/dflash/scripts/convert_dflash_to_gguf.py b/dflash/scripts/convert_dflash_to_gguf.py
index 72852f173..fae1be7e5 100644
--- a/dflash/scripts/convert_dflash_to_gguf.py
+++ b/dflash/scripts/convert_dflash_to_gguf.py
@@ -21,7 +21,7 @@
 name.
 
 Usage:
-  PYTHONPATH=../../dflash27b_ggml/deps/llama.cpp/gguf-py python convert_dflash_to_gguf.py \
+  PYTHONPATH=../deps/llama.cpp/gguf-py python convert_dflash_to_gguf.py \
     models/draft/model.safetensors \
     qwen3.5-27b-dflash-draft.gguf
 """
diff --git a/dflash/scripts/server.py b/dflash/scripts/server.py
index 78837fd4d..01bd81967 100644
--- a/dflash/scripts/server.py
+++ b/dflash/scripts/server.py
@@ -132,7 +132,7 @@ def _extra_daemon_has_target_sharding(extra: list[str] | None) -> bool:
 # Architecture strings stored in `general.architecture` of every GGUF this
 # server can drive. test_dflash dispatches by GGUF arch internally:
 #   qwen35 / qwen36  -> existing DFlash + DDTree pipeline
-#   laguna           -> dflash27b::run_laguna_daemon() (no spec-decode)
+#   laguna           -> dflash::common::run_laguna_daemon() (no spec-decode)
 # server.py just needs to omit --draft + the DFlash/DDTree flags when the
 # arch doesn't support speculative decoding yet.
 _QWEN35_ARCHES = {"qwen35", "qwen36"}
@@ -843,7 +843,7 @@ async def _openai_compat_error_handler(_request: Request, exc: OpenAICompatError
 
     if arch in _LAGUNA_ARCHES:
         # test_dflash detects arch=laguna from the GGUF and dispatches
-        # internally to dflash27b::run_laguna_daemon(). No --draft, no
+        # internally to dflash::common::run_laguna_daemon(). No --draft, no
         # --fast-rollback, no --ddtree (no Laguna spec-decode draft yet).
         # Tokens stream as int32 LE on stream_fd terminated by -1, byte-
         # identical to the qwen35 path so SSE/stream consumers stay shared.
diff --git a/dflash/src/bsa_launcher.cu b/dflash/src/bsa_launcher.cu
index 19698afe7..258040658 100644
--- a/dflash/src/bsa_launcher.cu
+++ b/dflash/src/bsa_launcher.cu
@@ -28,7 +28,7 @@ template<typename T, int Headdim, bool Is_causal>
 void run_mha_fwd_block_(Flash_fwd_params &params, cudaStream_t stream);
 }
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 namespace {
@@ -275,4 +275,4 @@ fail:
 }
 
 }  // namespace flashprefill
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/bsa_launcher_hip.cu b/dflash/src/bsa_launcher_hip.cu
index 0745d1c2d..870230b95 100644
--- a/dflash/src/bsa_launcher_hip.cu
+++ b/dflash/src/bsa_launcher_hip.cu
@@ -19,7 +19,7 @@
 #include <cstdint>
 #include <cstdlib>          // size_t
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // Defined in flashprefill_kernels.hip.cu.
@@ -108,4 +108,4 @@ extern "C" int launch_bsa_sparse_flash_forward_bf16(
 }
 
 }  // namespace flashprefill
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/attn_masks.h b/dflash/src/common/attn_masks.h
index b61adab30..ec25ca561 100644
--- a/dflash/src/common/attn_masks.h
+++ b/dflash/src/common/attn_masks.h
@@ -10,7 +10,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Minimum alignment required by ggml flash_attn_ext for mask rows.
 static constexpr int KQ_MASK_PAD = 32;
@@ -75,4 +75,4 @@ inline void build_tree_mask(const DDTree & tree, int past_length,
     }
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/backend_factory.cpp b/dflash/src/common/backend_factory.cpp
index 30da7dec0..018045694 100644
--- a/dflash/src/common/backend_factory.cpp
+++ b/dflash/src/common/backend_factory.cpp
@@ -10,7 +10,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 std::string detect_arch(const char * model_path) {
     auto info = inspect_gguf_model_info(model_path);
@@ -107,4 +107,4 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
     }
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/backend_factory.h b/dflash/src/common/backend_factory.h
index e4caed652..5a4195502 100644
--- a/dflash/src/common/backend_factory.h
+++ b/dflash/src/common/backend_factory.h
@@ -16,7 +16,7 @@
 #include <memory>
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Backend creation arguments ─────────────────────────────────────────
 // A superset of all per-arch config fields. The factory reads only those
@@ -62,4 +62,4 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args);
 // Useful for early dispatch (e.g. printing which backend will be used).
 std::string detect_arch(const char * model_path);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/daemon_loop.cpp b/dflash/src/common/daemon_loop.cpp
index ecc58f67d..94bd5c509 100644
--- a/dflash/src/common/daemon_loop.cpp
+++ b/dflash/src/common/daemon_loop.cpp
@@ -25,7 +25,7 @@
 #define ssize_t long
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── DaemonIO ────────────────────────────────────────────────────────────
 
@@ -424,4 +424,4 @@ int run_daemon(ModelBackend & backend, const DaemonLoopArgs & args) {
     return 0;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/daemon_loop.h b/dflash/src/common/daemon_loop.h
index 7d8012ecf..de5f859b1 100644
--- a/dflash/src/common/daemon_loop.h
+++ b/dflash/src/common/daemon_loop.h
@@ -11,7 +11,7 @@
 
 #include "model_backend.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DaemonLoopArgs {
     int stream_fd = -1;
@@ -23,4 +23,4 @@ struct DaemonLoopArgs {
 // commands until `quit`, `exit`, or EOF.  Returns 0 on clean shutdown.
 int run_daemon(ModelBackend & backend, const DaemonLoopArgs & args);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/ddtree.cpp b/dflash/src/common/ddtree.cpp
index e108f259d..08ca33464 100644
--- a/dflash/src/common/ddtree.cpp
+++ b/dflash/src/common/ddtree.cpp
@@ -7,7 +7,7 @@
 #include <cmath>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 void extract_draft_topk(const float * logits,
                         int n_positions, int vocab, int K,
@@ -223,4 +223,4 @@ std::vector<int> follow_verified_tree(const DDTree & tree,
     return accepted;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/ddtree.h b/dflash/src/common/ddtree.h
index 75b8d8cce..afe22f226 100644
--- a/dflash/src/common/ddtree.h
+++ b/dflash/src/common/ddtree.h
@@ -13,7 +13,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // A flat DFS-ordered tree built from the draft's top-K softmax distributions.
 // Slot 0 is the tree root (the bonus token from the previous spec round);
@@ -61,4 +61,4 @@ std::vector<int> follow_verified_tree(const DDTree & tree,
                                       int & out_next_token,
                                       int * out_node_idx = nullptr);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/device_placement.h b/dflash/src/common/device_placement.h
index 0b373e463..a6f87e171 100644
--- a/dflash/src/common/device_placement.h
+++ b/dflash/src/common/device_placement.h
@@ -9,7 +9,7 @@
 
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DevicePlacement {
     int gpu = 0;                              // primary GPU (single-GPU mode)
@@ -28,4 +28,4 @@ struct DevicePlacement {
     }
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_capture.cpp b/dflash/src/common/dflash_capture.cpp
index 3b42f771f..a67fe9b9a 100644
--- a/dflash/src/common/dflash_capture.cpp
+++ b/dflash/src/common/dflash_capture.cpp
@@ -1,6 +1,6 @@
 #include "dflash_capture.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 int target_capture_index(const int * capture_layer_ids,
                          int n_capture_layers,
@@ -12,4 +12,4 @@ int target_capture_index(const int * capture_layer_ids,
     return -1;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_capture.h b/dflash/src/common/dflash_capture.h
index b96d34bd1..8e4415097 100644
--- a/dflash/src/common/dflash_capture.h
+++ b/dflash/src/common/dflash_capture.h
@@ -7,7 +7,7 @@
 
 #pragma once
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Linear search for layer_idx in capture_layer_ids[0..n_capture_layers).
 // Returns the capture index (0..n_capture_layers-1) on hit, -1 on miss.
@@ -15,4 +15,4 @@ int target_capture_index(const int * capture_layer_ids,
                          int n_capture_layers,
                          int layer_idx);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_draft_graph.cpp b/dflash/src/common/dflash_draft_graph.cpp
index ca46219d7..2e60acb6b 100644
--- a/dflash/src/common/dflash_draft_graph.cpp
+++ b/dflash/src/common/dflash_draft_graph.cpp
@@ -5,7 +5,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Build draft graph at a given ctx_len into sg. Does NOT touch sg.alloc.
 // mirror_view: if true, uses a view into mirror->target_feat at slot0.
@@ -128,4 +128,4 @@ bool build_draft_step(
     return ggml_gallocr_alloc_graph(sg.alloc, sg.gf);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_draft_graph.h b/dflash/src/common/dflash_draft_graph.h
index e6bbf1fb7..c09110270 100644
--- a/dflash/src/common/dflash_draft_graph.h
+++ b/dflash/src/common/dflash_draft_graph.h
@@ -15,7 +15,7 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Draft forward: speculative next-token prediction using target features.
 //   lm_head: optional target lm_head tensor for fused projection. When
@@ -33,4 +33,4 @@ bool build_draft_step(
     int committed = 0,
     int ctx_len_max = 0);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_draft_ipc.cpp b/dflash/src/common/dflash_draft_ipc.cpp
index b23005f1f..0cab347b8 100644
--- a/dflash/src/common/dflash_draft_ipc.cpp
+++ b/dflash/src/common/dflash_draft_ipc.cpp
@@ -17,7 +17,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── DFlashDraftIpcClient ────────────────────────────────────────────
 
@@ -248,4 +248,4 @@ bool copy_capture_slice_to_remote_draft(
     return remote.send_feature_slice(capture_idx, start_pos, n_tokens, host);
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/dflash_draft_ipc.h b/dflash/src/common/dflash_draft_ipc.h
index 26877b4f7..f5eb47ac4 100644
--- a/dflash/src/common/dflash_draft_ipc.h
+++ b/dflash/src/common/dflash_draft_ipc.h
@@ -31,7 +31,7 @@
 #  include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── IPC Client (parent process) ─────────────────────────────────────
 
@@ -120,4 +120,4 @@ int run_dflash_draft_ipc_daemon(const char * draft_path,
                                 int draft_gpu,
                                 int stream_fd);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/dflash_draft_ipc_daemon.cpp b/dflash/src/common/dflash_draft_ipc_daemon.cpp
index 6653fc436..176798f85 100644
--- a/dflash/src/common/dflash_draft_ipc_daemon.cpp
+++ b/dflash/src/common/dflash_draft_ipc_daemon.cpp
@@ -24,7 +24,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int run_dflash_draft_ipc_daemon(const char * draft_path,
                                 int ring_cap,
@@ -208,4 +208,4 @@ int run_dflash_draft_ipc_daemon(const char * draft_path,
 #endif
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/dflash_feature_ring.cpp b/dflash/src/common/dflash_feature_ring.cpp
index f8ab43ea3..8cf8fa83a 100644
--- a/dflash/src/common/dflash_feature_ring.cpp
+++ b/dflash/src/common/dflash_feature_ring.cpp
@@ -16,7 +16,7 @@ extern "C++" to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
 
 #include "gpu_runtime_compat.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── internal helpers ────────────────────────────────────────────
 
@@ -236,4 +236,4 @@ bool copy_feature_ring_range_to_tensor(
     return cudaDeviceSynchronize() == cudaSuccess;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_feature_ring.h b/dflash/src/common/dflash_feature_ring.h
index 133b69199..e681cecfb 100644
--- a/dflash/src/common/dflash_feature_ring.h
+++ b/dflash/src/common/dflash_feature_ring.h
@@ -19,7 +19,7 @@
 #include <cstddef>
 #include <cstdint>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DraftFeatureMirror {
     ggml_context * ctx = nullptr;
@@ -88,4 +88,4 @@ bool copy_feature_ring_range_to_tensor(
     int start_pos,
     int n_tokens);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/dflash_layer_split_runtime.h b/dflash/src/common/dflash_layer_split_runtime.h
index 5e302e660..5c539989f 100644
--- a/dflash/src/common/dflash_layer_split_runtime.h
+++ b/dflash/src/common/dflash_layer_split_runtime.h
@@ -13,7 +13,7 @@
 #include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Runtime configuration (replaces globals) ────────────────────────
 
@@ -74,4 +74,4 @@ inline bool activation_pair_init(ActivationPair & p,
     return true;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/dflash_spec_decode.cpp b/dflash/src/common/dflash_spec_decode.cpp
index aa1831f34..9284a3440 100644
--- a/dflash/src/common/dflash_spec_decode.cpp
+++ b/dflash/src/common/dflash_spec_decode.cpp
@@ -12,7 +12,7 @@
 #include <cstdio>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 // RAII guard so any early `return false` path frees the per-call draft graph.
@@ -205,5 +205,5 @@ bool run_dflash_spec_decode(
     return true;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
 
diff --git a/dflash/src/common/dflash_spec_decode.h b/dflash/src/common/dflash_spec_decode.h
index 1b35735f0..1337e1ff0 100644
--- a/dflash/src/common/dflash_spec_decode.h
+++ b/dflash/src/common/dflash_spec_decode.h
@@ -21,7 +21,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DraftWeights;  // forward-decl from internal.h
 
@@ -50,4 +50,4 @@ bool run_dflash_spec_decode(
         int stream_fd = -1,
         DFlashDraftIpcClient * remote_draft = nullptr);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/dflash_target.h b/dflash/src/common/dflash_target.h
index 3d563cf15..56fd4bece 100644
--- a/dflash/src/common/dflash_target.h
+++ b/dflash/src/common/dflash_target.h
@@ -14,7 +14,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DFlashTarget {
     virtual ~DFlashTarget() = default;
@@ -74,4 +74,4 @@ struct DFlashTarget {
     virtual const std::vector<int> & capture_layer_ids() const = 0;
 };
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/gguf_inspect.cpp b/dflash/src/common/gguf_inspect.cpp
index 7baf3d6b2..95cc30c41 100644
--- a/dflash/src/common/gguf_inspect.cpp
+++ b/dflash/src/common/gguf_inspect.cpp
@@ -5,7 +5,7 @@
 #include <cstring>
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 GgufModelInfo inspect_gguf_model_info(const char * path) {
     GgufModelInfo info;
@@ -36,4 +36,4 @@ GgufModelInfo inspect_gguf_model_info(const char * path) {
     return info;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/gguf_inspect.h b/dflash/src/common/gguf_inspect.h
index be84c9109..11c11379e 100644
--- a/dflash/src/common/gguf_inspect.h
+++ b/dflash/src/common/gguf_inspect.h
@@ -7,7 +7,7 @@
 
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct GgufModelInfo {
     std::string arch;       // e.g. "qwen35", "laguna", "qwen3", "gemma4"
@@ -18,4 +18,4 @@ struct GgufModelInfo {
 // Returns info with arch="" and n_layer=-1 on failure.
 GgufModelInfo inspect_gguf_model_info(const char * path);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/io_utils.h b/dflash/src/common/io_utils.h
index 8fc14d29d..cf0c4a46f 100644
--- a/dflash/src/common/io_utils.h
+++ b/dflash/src/common/io_utils.h
@@ -21,7 +21,7 @@
 #  include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Binary file I/O ────────────────────────────────────────────────
 
@@ -134,4 +134,4 @@ static inline int argmax_f32(const float * x, int n) {
     return best;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/common/layer_split_utils.cpp b/dflash/src/common/layer_split_utils.cpp
index 3c49b4f66..57bf0db49 100644
--- a/dflash/src/common/layer_split_utils.cpp
+++ b/dflash/src/common/layer_split_utils.cpp
@@ -4,7 +4,7 @@
 #include <cmath>
 #include <set>
 
-namespace dflash27b {
+namespace dflash::common {
 
 std::vector<std::pair<int,int>> compute_layer_ranges(
     int n_layer,
@@ -86,4 +86,4 @@ std::string validate_device_placement(
     return {};  // ok
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/layer_split_utils.h b/dflash/src/common/layer_split_utils.h
index fcb2ba2e2..ba675947c 100644
--- a/dflash/src/common/layer_split_utils.h
+++ b/dflash/src/common/layer_split_utils.h
@@ -11,7 +11,7 @@
 #include <utility>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Compute [begin, end) layer ranges for each GPU shard.
 // If weights is empty, splits layers equally.
@@ -28,4 +28,4 @@ std::string validate_device_placement(
     const DevicePlacement & dp,
     int cuda_device_count);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h
index 9a3de0cae..504b68eb0 100644
--- a/dflash/src/common/model_backend.h
+++ b/dflash/src/common/model_backend.h
@@ -19,7 +19,7 @@
 #include "ggml-backend.h"
 #include "sampler.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Token callback for streaming generation. Called once per committed token.
 // Return true to continue generation, false to abort.
@@ -180,4 +180,4 @@ struct ModelBackend {
     virtual void shutdown() = 0;
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/peer_access.cpp b/dflash/src/common/peer_access.cpp
index fb0dedf84..9658dce9d 100644
--- a/dflash/src/common/peer_access.cpp
+++ b/dflash/src/common/peer_access.cpp
@@ -3,7 +3,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── global state ────────────────────────────────────────────────
 bool g_peer_access_opt_in = false;
@@ -100,4 +100,4 @@ bool copy_peer_async(void * dst, int dst_device,
 #endif
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/peer_access.h b/dflash/src/common/peer_access.h
index db2ea21fc..f6e2465c7 100644
--- a/dflash/src/common/peer_access.h
+++ b/dflash/src/common/peer_access.h
@@ -15,7 +15,7 @@
 
 #include "gpu_runtime_compat.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── global state ────────────────────────────────────────────────
 // Set from argv: opt into cudaMemcpyPeerAsync for cross-device copies.
@@ -34,4 +34,4 @@ bool copy_peer_async(void * dst, int dst_device,
                      size_t bytes,
                      cudaStream_t stream = nullptr);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/restore_delta.h b/dflash/src/common/restore_delta.h
index 11ee24bb4..3b5814cfc 100644
--- a/dflash/src/common/restore_delta.h
+++ b/dflash/src/common/restore_delta.h
@@ -4,7 +4,7 @@
 #include <stdexcept>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 inline std::vector<int32_t> restore_prompt_delta(const std::vector<int32_t> & prompt,
                                                  int cached_prefix_len) {
@@ -19,4 +19,4 @@ inline std::vector<int32_t> restore_prompt_delta(const std::vector<int32_t> & pr
         prompt.end());
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/sampler.cpp b/dflash/src/common/sampler.cpp
index a6879f1c7..c05bdd3e2 100644
--- a/dflash/src/common/sampler.cpp
+++ b/dflash/src/common/sampler.cpp
@@ -8,7 +8,7 @@
 #include <unordered_set>
 #include <utility>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int sample_logits(const float * logits_in,
                   int vocab,
@@ -95,4 +95,4 @@ bool parse_sampler_token(std::string & line, SamplerCfg & out) {
     return true;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/sampler.h b/dflash/src/common/sampler.h
index 59e1ca131..e7a86dc02 100644
--- a/dflash/src/common/sampler.h
+++ b/dflash/src/common/sampler.h
@@ -1,6 +1,6 @@
 // Shared CPU sampler chain used by both target arches.
 //
-// dflash27b daemon protocol embeds optional sampler params as a tail on each
+// The dflash daemon protocol embeds optional sampler params as a tail on each
 // generate command: ` samp=temp,top_p,top_k,rep_pen,seed`. parse_sampler_token
 // strips the tail in place and fills a SamplerCfg; sample_logits applies the
 // chain rep_penalty -> top_k -> softmax(temp) -> top_p -> draw.
@@ -15,7 +15,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct SamplerCfg {
     float    temp       = 0.0f;
@@ -39,4 +39,4 @@ int sample_logits(const float * logits_in,
 // top_k=0, rep_pen=1, seed=0).
 bool parse_sampler_token(std::string & line, SamplerCfg & out);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/snapshot_backend.h b/dflash/src/common/snapshot_backend.h
index 60636a81f..a77e70a47 100644
--- a/dflash/src/common/snapshot_backend.h
+++ b/dflash/src/common/snapshot_backend.h
@@ -17,7 +17,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Select or create a backend for prefix cache snapshot storage.
 //
@@ -53,4 +53,4 @@ inline void free_snapshot_backend(ggml_backend_t snap_backend,
     }
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/common/step_graph.h b/dflash/src/common/step_graph.h
index 9ff4f4305..5e18f3f61 100644
--- a/dflash/src/common/step_graph.h
+++ b/dflash/src/common/step_graph.h
@@ -14,7 +14,7 @@
 
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct StepGraph {
     ggml_context *  ctx = nullptr;
@@ -66,4 +66,4 @@ inline void step_graph_destroy(StepGraph & sg) {
     step_graph_free(sg);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/delta_net_chunked.cpp b/dflash/src/delta_net_chunked.cpp
index e91bfc862..c3421bf2b 100644
--- a/dflash/src/delta_net_chunked.cpp
+++ b/dflash/src/delta_net_chunked.cpp
@@ -19,7 +19,7 @@
 
 #include <cmath>
 
-namespace dflash27b {
+namespace dflash::common {
 
 static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) {
     return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3],
@@ -234,4 +234,4 @@ DeltaNetChunkedResult build_delta_net_chunked(
     return r;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/delta_net_chunked.h b/dflash/src/delta_net_chunked.h
index 80e6e00d6..173c97d54 100644
--- a/dflash/src/delta_net_chunked.h
+++ b/dflash/src/delta_net_chunked.h
@@ -4,7 +4,7 @@
 
 #include <ggml.h>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DeltaNetChunkedResult {
     ggml_tensor * output;     // [S_v, H_v, n_tokens, n_seqs]
@@ -24,4 +24,4 @@ DeltaNetChunkedResult build_delta_net_chunked(
         ggml_tensor  * b,
         ggml_tensor  * s);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/draft/draft_dflash_graph.cpp b/dflash/src/draft/draft_dflash_graph.cpp
index b83ccb0bb..eddfba9a1 100644
--- a/dflash/src/draft/draft_dflash_graph.cpp
+++ b/dflash/src/draft/draft_dflash_graph.cpp
@@ -33,7 +33,7 @@
 
 #include <cmath>
 
-namespace dflash27b {
+namespace dflash::common {
 
 DraftGraphOutputs build_draft_graph(
     ggml_context *            ctx,
@@ -182,4 +182,4 @@ DraftGraphOutputs build_draft_graph(
     return og;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/draft/draft_gguf_loader.cpp b/dflash/src/draft/draft_gguf_loader.cpp
index 35b503279..89f7b17cf 100644
--- a/dflash/src/draft/draft_gguf_loader.cpp
+++ b/dflash/src/draft/draft_gguf_loader.cpp
@@ -39,7 +39,7 @@
 #include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -346,4 +346,4 @@ bool load_draft_gguf(const std::string & path,
     return true;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/draft/draft_graph.h b/dflash/src/draft/draft_graph.h
index 304ff8e39..28bc0d837 100644
--- a/dflash/src/draft/draft_graph.h
+++ b/dflash/src/draft/draft_graph.h
@@ -3,7 +3,7 @@
 
 #include "ggml.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct DraftWeights; // fwd
 
@@ -30,4 +30,4 @@ DraftGraphOutputs build_draft_graph(
     const DraftWeights &      w,
     const DraftGraphInputs &  in);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/draft/draft_safetensors_loader.cpp b/dflash/src/draft/draft_safetensors_loader.cpp
index 18825d7c6..d1ebcb989 100644
--- a/dflash/src/draft/draft_safetensors_loader.cpp
+++ b/dflash/src/draft/draft_safetensors_loader.cpp
@@ -55,7 +55,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -646,4 +646,4 @@ void free_draft_weights(DraftWeights & w) {
     w.out_norm = nullptr;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/errors.cpp b/dflash/src/errors.cpp
index 7ea8b0c96..970b771fe 100644
--- a/dflash/src/errors.cpp
+++ b/dflash/src/errors.cpp
@@ -7,7 +7,7 @@
 #include <mutex>
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 std::mutex g_err_mu;
@@ -19,9 +19,9 @@ void set_last_error(std::string msg) {
     g_last_error = std::move(msg);
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
 
 extern "C" const char * dflash27b_last_error(void) {
-    std::lock_guard<std::mutex> lk(dflash27b::g_err_mu);
-    return dflash27b::g_last_error.c_str();
+    std::lock_guard<std::mutex> lk(dflash::common::g_err_mu);
+    return dflash::common::g_last_error.c_str();
 }
diff --git a/dflash/src/flashprefill.cpp b/dflash/src/flashprefill.cpp
index e83be1e77..0745c66bc 100644
--- a/dflash/src/flashprefill.cpp
+++ b/dflash/src/flashprefill.cpp
@@ -12,7 +12,7 @@
 #include <vector>
 #include "device_runtime.h"
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // Kernel launcher declarations — architecture-specific.
@@ -637,4 +637,4 @@ int flash_prefill_forward_f16(
 }
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/flashprefill.h b/dflash/src/flashprefill.h
index ad7c70c0c..1cb0f66d2 100644
--- a/dflash/src/flashprefill.h
+++ b/dflash/src/flashprefill.h
@@ -32,7 +32,7 @@
 #include <cstdint>
 #include "ggml-backend.h"
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // Algorithmic parameters for the FlashPrefill selection + sparse forward.
@@ -98,4 +98,4 @@ extern "C" void dflash_bsa_free_persistent();
 #endif
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/flashprefill_f16.cu b/dflash/src/flashprefill_f16.cu
index 645904789..80cd0204f 100644
--- a/dflash/src/flashprefill_f16.cu
+++ b/dflash/src/flashprefill_f16.cu
@@ -19,7 +19,7 @@
 #include <cuda_fp16.h>
 #include <mma.h>
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // ── Kernel 1: compute_mean_vector (F16) ──────────────────────────────
@@ -673,6 +673,6 @@ extern "C" void launch_block_select_f16(
 }
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
 
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
diff --git a/dflash/src/flashprefill_kernels.cu b/dflash/src/flashprefill_kernels.cu
index 490f32c49..2dcfc80ce 100644
--- a/dflash/src/flashprefill_kernels.cu
+++ b/dflash/src/flashprefill_kernels.cu
@@ -39,7 +39,7 @@
 #include <mma.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // ── cp.async helpers (sm_8x) ─────────────────────────────────────────
@@ -1047,6 +1047,6 @@ extern "C" void launch_block_select(
 }
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
 
 #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
diff --git a/dflash/src/flashprefill_kernels.hip.cu b/dflash/src/flashprefill_kernels.hip.cu
index dc81d1d6d..d7936f378 100644
--- a/dflash/src/flashprefill_kernels.hip.cu
+++ b/dflash/src/flashprefill_kernels.hip.cu
@@ -23,7 +23,7 @@
 #include <hip/hip_bfloat16.h>
 #include <rocwmma/rocwmma.hpp>
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // ---- Kernel 1: compute_mean_vector ----
@@ -735,4 +735,4 @@ extern "C" void launch_block_select(
 // launch_rms_norm_mul_w_f32 is defined in rms_norm_hip.cu (compiled for all HIP builds).
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/flashprefill_q8.cpp b/dflash/src/flashprefill_q8.cpp
index df876d5b7..e3e2737b2 100644
--- a/dflash/src/flashprefill_q8.cpp
+++ b/dflash/src/flashprefill_q8.cpp
@@ -21,7 +21,7 @@
 #include <cstdio>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 namespace {
@@ -166,4 +166,4 @@ int flash_prefill_forward_q8(
 }
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/flashprefill_scalar.cu b/dflash/src/flashprefill_scalar.cu
index 6d7ffa5bd..1d90fcbce 100644
--- a/dflash/src/flashprefill_scalar.cu
+++ b/dflash/src/flashprefill_scalar.cu
@@ -19,7 +19,7 @@
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // =============================================================================
@@ -512,6 +512,6 @@ extern "C" void launch_block_select_pascal(
 }
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
 
 #endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700)
diff --git a/dflash/src/flashprefill_select.cpp b/dflash/src/flashprefill_select.cpp
index a18b05d3e..4480d8e40 100644
--- a/dflash/src/flashprefill_select.cpp
+++ b/dflash/src/flashprefill_select.cpp
@@ -18,7 +18,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 namespace flashprefill {
 
 // score: [B, M, N, H] row-major (B outer, H fastest).
@@ -84,4 +84,4 @@ void block_select_host(
 }
 
 } // namespace flashprefill
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/dflash/src/gemma4/gemma4_backend.cpp
index ea33beed5..5b6a0725c 100644
--- a/dflash/src/gemma4/gemma4_backend.cpp
+++ b/dflash/src/gemma4/gemma4_backend.cpp
@@ -17,7 +17,7 @@
 #include <cstdio>
 #include <cmath>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Ctor / dtor ────────────────────────────────────────────────────────
 
@@ -370,4 +370,4 @@ void Gemma4Backend::shutdown() {
     std::printf("[gemma4] shutdown\n"); std::fflush(stdout);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_backend.h b/dflash/src/gemma4/gemma4_backend.h
index 312466bc8..84fa08b35 100644
--- a/dflash/src/gemma4/gemma4_backend.h
+++ b/dflash/src/gemma4/gemma4_backend.h
@@ -17,7 +17,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Gemma4BackendConfig {
     const char *    model_path = nullptr;
@@ -89,4 +89,4 @@ class Gemma4Backend : public ModelBackend {
                    const DaemonIO & io);
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_daemon.cpp b/dflash/src/gemma4/gemma4_daemon.cpp
index e3a495633..8c653e001 100644
--- a/dflash/src/gemma4/gemma4_daemon.cpp
+++ b/dflash/src/gemma4/gemma4_daemon.cpp
@@ -6,7 +6,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int run_gemma4_daemon(const Gemma4DaemonArgs & args) {
     Gemma4BackendConfig cfg;
@@ -26,4 +26,4 @@ int run_gemma4_daemon(const Gemma4DaemonArgs & args) {
     return run_daemon(backend, da);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_daemon.h b/dflash/src/gemma4/gemma4_daemon.h
index 46cd960a8..7f753b7bf 100644
--- a/dflash/src/gemma4/gemma4_daemon.h
+++ b/dflash/src/gemma4/gemma4_daemon.h
@@ -5,7 +5,7 @@
 #include "device_placement.h"
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Gemma4DaemonArgs {
     const char *     model_path = nullptr;
@@ -17,4 +17,4 @@ struct Gemma4DaemonArgs {
 
 int run_gemma4_daemon(const Gemma4DaemonArgs & args);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_graph.cpp b/dflash/src/gemma4/gemma4_graph.cpp
index a630c9fe8..c4522edd7 100644
--- a/dflash/src/gemma4/gemma4_graph.cpp
+++ b/dflash/src/gemma4/gemma4_graph.cpp
@@ -28,7 +28,7 @@
 #include "ggml-cuda.h"
 #include "ggml-alloc.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 static constexpr float GEMMA4_EPS = 1e-6f;
 
@@ -445,4 +445,4 @@ bool gemma4_step(
     return true;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_internal.h b/dflash/src/gemma4/gemma4_internal.h
index f4f49c4e2..d34107b75 100644
--- a/dflash/src/gemma4/gemma4_internal.h
+++ b/dflash/src/gemma4/gemma4_internal.h
@@ -19,7 +19,7 @@
 
 #include "internal.h"  // CpuEmbedder
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Gemma4Layer {
     // Pre-attn norm
@@ -181,4 +181,4 @@ bool gemma4_step(
     int                     kv_start,
     std::vector<float> &    out_logits);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/gemma4/gemma4_loader.cpp b/dflash/src/gemma4/gemma4_loader.cpp
index de5621e60..d40db53a4 100644
--- a/dflash/src/gemma4/gemma4_loader.cpp
+++ b/dflash/src/gemma4/gemma4_loader.cpp
@@ -27,7 +27,7 @@
 #include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -367,4 +367,4 @@ void free_gemma4_snapshot(Gemma4Snapshot & s) {
     s.cur_pos = 0;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/internal.h b/dflash/src/internal.h
index 05bc6823d..6f5666df6 100644
--- a/dflash/src/internal.h
+++ b/dflash/src/internal.h
@@ -1,4 +1,4 @@
-// Internal-only shared header for dflash27b library sources.
+// Internal-only shared header for dflash::common library sources.
 // Not installed, not exposed in the public API.
 
 #pragma once
@@ -24,7 +24,7 @@
 
 #include "dflash27b.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Single source of truth for error reporting.
 // All loaders / graph builders push into this via set_last_error(...).
@@ -536,7 +536,7 @@ ggml_tensor * build_qwen35_layer(
     ggml_tensor *         q_tail_capture = nullptr,
     int                   q_tail_start = 0);
 
-} // namespace dflash27b
+} // namespace dflash::common
 
 #if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP)
 #include <cuda_runtime.h>
diff --git a/dflash/src/kv_cache.cpp b/dflash/src/kv_cache.cpp
index b2006eaa0..0ffe21834 100644
--- a/dflash/src/kv_cache.cpp
+++ b/dflash/src/kv_cache.cpp
@@ -19,8 +19,8 @@
 
 #include "internal.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Placeholder; real impl lives with the spec_loop driver.
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/kv_quant.cpp b/dflash/src/kv_quant.cpp
index f2a51c648..676a103d2 100644
--- a/dflash/src/kv_quant.cpp
+++ b/dflash/src/kv_quant.cpp
@@ -1,4 +1,4 @@
-// KV-cache quantisation helpers for dflash27b.
+// KV-cache quantisation helpers for dflash::common.
 //
 // Centralises the supported (K, V) ggml_type pair table and environment-variable
 // resolution that was previously inlined in qwen35_target_graph.cpp.
diff --git a/dflash/src/laguna/laguna_backend.cpp b/dflash/src/laguna/laguna_backend.cpp
index 9781fdb49..ca64ab943 100644
--- a/dflash/src/laguna/laguna_backend.cpp
+++ b/dflash/src/laguna/laguna_backend.cpp
@@ -21,7 +21,7 @@
 #include <fstream>
 #include <sstream>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Construction / initialisation ───────────────────────────────────────
 
@@ -398,7 +398,7 @@ bool LagunaBackend::handle_compress(const std::string & line,
 
 void LagunaBackend::free_drafter() {
     if (drafter_loaded_) {
-        dflash27b::free_drafter(drafter_ctx_);
+        dflash::common::free_drafter(drafter_ctx_);
         drafter_loaded_ = false;
         std::printf("[drafter] freed\n"); std::fflush(stdout);
     }
@@ -409,7 +409,7 @@ void LagunaBackend::free_drafter() {
 void LagunaBackend::shutdown() {
     for (auto & snap : snapshots_) laguna_snapshot_free(snap);
     if (drafter_loaded_) {
-        dflash27b::free_drafter(drafter_ctx_);
+        dflash::common::free_drafter(drafter_ctx_);
         drafter_loaded_ = false;
     }
     if (!target_parked_) {
@@ -424,4 +424,4 @@ void LagunaBackend::shutdown() {
     }
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/laguna/laguna_backend.h b/dflash/src/laguna/laguna_backend.h
index 387e758a6..7e487d558 100644
--- a/dflash/src/laguna/laguna_backend.h
+++ b/dflash/src/laguna/laguna_backend.h
@@ -18,7 +18,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct LagunaBackendArgs {
     std::string target_path;
@@ -78,4 +78,4 @@ class LagunaBackend : public ModelBackend {
     bool ensure_slot(int slot);
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/laguna/laguna_daemon.cpp b/dflash/src/laguna/laguna_daemon.cpp
index 06ec21458..3a64e0b8d 100644
--- a/dflash/src/laguna/laguna_daemon.cpp
+++ b/dflash/src/laguna/laguna_daemon.cpp
@@ -16,7 +16,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int run_laguna_daemon(const LagunaDaemonArgs & args) {
     LagunaBackendArgs bargs;
@@ -36,4 +36,4 @@ int run_laguna_daemon(const LagunaDaemonArgs & args) {
     return run_daemon(backend, dargs);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/laguna/laguna_daemon.h b/dflash/src/laguna/laguna_daemon.h
index 53a78566b..527582776 100644
--- a/dflash/src/laguna/laguna_daemon.h
+++ b/dflash/src/laguna/laguna_daemon.h
@@ -17,7 +17,7 @@
 #include <string>
 #include "ggml.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct LagunaDaemonArgs {
     std::string     target_path;       // path to laguna-*.gguf
@@ -37,4 +37,4 @@ struct LagunaDaemonArgs {
 // `exit`, or EOF. Returns the process exit code (0 on clean shutdown).
 int run_laguna_daemon(const LagunaDaemonArgs & args);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/laguna/laguna_internal.h b/dflash/src/laguna/laguna_internal.h
index a98d7a8b5..7c28c43ae 100644
--- a/dflash/src/laguna/laguna_internal.h
+++ b/dflash/src/laguna/laguna_internal.h
@@ -30,7 +30,7 @@
 
 #include "internal.h"  // for CpuEmbedder
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct LagunaTargetLayer {
     // Pre-attn + pre-ffn norms (Laguna has only these two; no post norms).
@@ -252,4 +252,4 @@ bool laguna_step(
     bool                        no_mask,
     std::vector<float> &        out_logits);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/laguna/laguna_target_graph.cpp b/dflash/src/laguna/laguna_target_graph.cpp
index 7d3c21788..8f2e3c638 100644
--- a/dflash/src/laguna/laguna_target_graph.cpp
+++ b/dflash/src/laguna/laguna_target_graph.cpp
@@ -33,7 +33,7 @@
 #include "ggml-cuda.h"
 #include "ggml-alloc.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 static constexpr float LAGUNA_EPS = 1e-6f;
 
@@ -785,4 +785,4 @@ bool laguna_step(
     return true;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/laguna/laguna_target_loader.cpp b/dflash/src/laguna/laguna_target_loader.cpp
index 77e0acb58..c3ca44469 100644
--- a/dflash/src/laguna/laguna_target_loader.cpp
+++ b/dflash/src/laguna/laguna_target_loader.cpp
@@ -55,7 +55,7 @@
 #include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -473,4 +473,4 @@ void free_laguna_target_weights(LagunaTargetWeights & w) {
     w.output   = nullptr;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/pflash_ggml_adapter.cpp b/dflash/src/pflash_ggml_adapter.cpp
index 4862d379b..93434761f 100644
--- a/dflash/src/pflash_ggml_adapter.cpp
+++ b/dflash/src/pflash_ggml_adapter.cpp
@@ -11,7 +11,7 @@ static int pflash_adapter(
     int batch, int seq_len, int n_q_heads, int n_k_heads, int head_dim,
     float scale, float alpha)
 {
-    dflash27b::flashprefill::FlashPrefillConfig cfg;
+    dflash::common::flashprefill::FlashPrefillConfig cfg;
     if (alpha >= 1.0f) {
         // alpha >= 1.0 means "select all blocks" — configure for dense attention
         cfg.alpha          = 0.0f;
@@ -21,7 +21,7 @@ static int pflash_adapter(
     } else {
         cfg.alpha = alpha;
     }
-    return dflash27b::flashprefill::flash_prefill_forward_bf16(
+    return dflash::common::flashprefill::flash_prefill_forward_bf16(
         Q, K, V, O,
         batch, seq_len, n_q_heads, n_k_heads, head_dim,
         scale, cfg);
diff --git a/dflash/src/qwen3/qwen3_backend.cpp b/dflash/src/qwen3/qwen3_backend.cpp
index d95b41413..58c476d42 100644
--- a/dflash/src/qwen3/qwen3_backend.cpp
+++ b/dflash/src/qwen3/qwen3_backend.cpp
@@ -20,7 +20,7 @@
 #include <cmath>
 #include <sstream>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Cache management ───────────────────────────────────────────────────
 
@@ -1031,7 +1031,7 @@ bool Qwen3Backend::handle_compress(const std::string & line, const DaemonIO & io
 
 void Qwen3Backend::free_drafter() {
     if (drafter_loaded_) {
-        dflash27b::free_drafter(drafter_ctx_);
+        dflash::common::free_drafter(drafter_ctx_);
         drafter_loaded_ = false;
     }
 }
@@ -1060,4 +1060,4 @@ void Qwen3Backend::shutdown() {
     }
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_backend.h b/dflash/src/qwen3/qwen3_backend.h
index bae81f52c..7d8bd0b0c 100644
--- a/dflash/src/qwen3/qwen3_backend.h
+++ b/dflash/src/qwen3/qwen3_backend.h
@@ -24,7 +24,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Qwen3BackendConfig {
     const char *    model_path = nullptr;
@@ -144,4 +144,4 @@ class Qwen3Backend : public ModelBackend {
     std::vector<float> last_logits_;  // logits from last prefill chunk
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_daemon.cpp b/dflash/src/qwen3/qwen3_daemon.cpp
index 9f478995e..f5fd59132 100644
--- a/dflash/src/qwen3/qwen3_daemon.cpp
+++ b/dflash/src/qwen3/qwen3_daemon.cpp
@@ -6,7 +6,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int run_qwen3_daemon(const Qwen3DaemonArgs & args) {
     Qwen3BackendConfig cfg;
@@ -26,4 +26,4 @@ int run_qwen3_daemon(const Qwen3DaemonArgs & args) {
     return run_daemon(backend, dargs);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_daemon.h b/dflash/src/qwen3/qwen3_daemon.h
index 1c457d427..2478a4c23 100644
--- a/dflash/src/qwen3/qwen3_daemon.h
+++ b/dflash/src/qwen3/qwen3_daemon.h
@@ -5,7 +5,7 @@
 #include "device_placement.h"
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Qwen3DaemonArgs {
     const char *    model_path = nullptr;
@@ -17,4 +17,4 @@ struct Qwen3DaemonArgs {
 
 int run_qwen3_daemon(const Qwen3DaemonArgs & args);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_drafter.cpp b/dflash/src/qwen3/qwen3_drafter.cpp
index 8f242eb54..c5e31fb3e 100644
--- a/dflash/src/qwen3/qwen3_drafter.cpp
+++ b/dflash/src/qwen3/qwen3_drafter.cpp
@@ -30,7 +30,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -795,4 +795,4 @@ std::vector<int32_t> drafter_score_and_compress(
     return out;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_drafter.h b/dflash/src/qwen3/qwen3_drafter.h
index b0687dadb..84b028a8f 100644
--- a/dflash/src/qwen3/qwen3_drafter.h
+++ b/dflash/src/qwen3/qwen3_drafter.h
@@ -23,7 +23,7 @@
 struct ggml_backend;
 typedef struct ggml_backend * ggml_backend_t;
 
-namespace dflash27b {
+namespace dflash::common {
 
 enum class DrafterArch {
     Qwen3_0p6b,
@@ -76,4 +76,4 @@ std::vector<int32_t> drafter_score_and_compress(
     int    n_lookahead = 8,
     int    pool_kernel = 13);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_drafter_model.h b/dflash/src/qwen3/qwen3_drafter_model.h
index bc0664e06..8a1b2d709 100644
--- a/dflash/src/qwen3/qwen3_drafter_model.h
+++ b/dflash/src/qwen3/qwen3_drafter_model.h
@@ -25,7 +25,7 @@ typedef struct ggml_backend * ggml_backend_t;
 struct ggml_backend_buffer;
 typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Qwen3DrafterLayer {
     ggml_tensor * attn_norm   = nullptr;  // [hidden]
@@ -89,4 +89,4 @@ bool forward_qwen3_drafter_model(
     int n_lookahead,
     std::vector<float> & running_max);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_graph.cpp b/dflash/src/qwen3/qwen3_graph.cpp
index c3c1238f5..c907546f3 100644
--- a/dflash/src/qwen3/qwen3_graph.cpp
+++ b/dflash/src/qwen3/qwen3_graph.cpp
@@ -51,7 +51,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -865,4 +865,4 @@ bool forward_qwen3_drafter_model(
     return true;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen3/qwen3_loader.cpp b/dflash/src/qwen3/qwen3_loader.cpp
index 5ab6138eb..009ff78d7 100644
--- a/dflash/src/qwen3/qwen3_loader.cpp
+++ b/dflash/src/qwen3/qwen3_loader.cpp
@@ -42,7 +42,7 @@
 #include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 namespace {
 
@@ -250,4 +250,4 @@ void free_qwen3_drafter_model(Qwen3DrafterWeights & w) {
     w.backend = nullptr;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/gguf_target_loader.cpp b/dflash/src/qwen35/gguf_target_loader.cpp
index 1499325de..62e209ebf 100644
--- a/dflash/src/qwen35/gguf_target_loader.cpp
+++ b/dflash/src/qwen35/gguf_target_loader.cpp
@@ -61,7 +61,7 @@
 #include <unistd.h>
 #endif
 
-namespace dflash27b {
+namespace dflash::common {
 
 // CpuEmbedder destructor + embed() method
 CpuEmbedder::~CpuEmbedder() {
@@ -683,4 +683,4 @@ void free_target_weights(TargetWeights & w) {
     w.output   = nullptr;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/graph_builders.cpp b/dflash/src/qwen35/graph_builders.cpp
index 035ffb61e..c1f51cdbf 100644
--- a/dflash/src/qwen35/graph_builders.cpp
+++ b/dflash/src/qwen35/graph_builders.cpp
@@ -4,7 +4,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── build_layer_step ────────────────────────────────────────────
 
@@ -260,4 +260,4 @@ bool build_lm_head_projection_step(
     return ggml_gallocr_alloc_graph(sg.alloc, sg.gf);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/graph_builders.h b/dflash/src/qwen35/graph_builders.h
index e820f4376..323e8e3d2 100644
--- a/dflash/src/qwen35/graph_builders.h
+++ b/dflash/src/qwen35/graph_builders.h
@@ -21,7 +21,7 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Layer-segmented prefill: process one target layer for chunk_start..chunk_start+n_tokens.
 bool build_layer_step(
@@ -73,4 +73,4 @@ bool build_lm_head_projection_step(
     ggml_backend_t backend,
     int n_tokens);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_daemon.cpp b/dflash/src/qwen35/layer_split_daemon.cpp
index 086e19dc6..c8c107bf9 100644
--- a/dflash/src/qwen35/layer_split_daemon.cpp
+++ b/dflash/src/qwen35/layer_split_daemon.cpp
@@ -12,7 +12,7 @@
 #include <cstdlib>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 bool run_target_layer_split_request(
         std::vector<TargetLayerSplitShard> & shards,
@@ -87,4 +87,4 @@ bool run_target_layer_split_request(
     return true;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_daemon.h b/dflash/src/qwen35/layer_split_daemon.h
index c58785248..3d87a3313 100644
--- a/dflash/src/qwen35/layer_split_daemon.h
+++ b/dflash/src/qwen35/layer_split_daemon.h
@@ -16,7 +16,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Handle one inference request over a set of layer-split shards.
 // Runs prefill, then either spec-decode (if run_dflash && draft available)
@@ -38,4 +38,4 @@ bool run_target_layer_split_request(
         int draft_ctx_max,
         int stream_fd = -1);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_daemon_loop.cpp b/dflash/src/qwen35/layer_split_daemon_loop.cpp
index 247971e6b..6790f5510 100644
--- a/dflash/src/qwen35/layer_split_daemon_loop.cpp
+++ b/dflash/src/qwen35/layer_split_daemon_loop.cpp
@@ -20,7 +20,7 @@
 #include <random>
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int run_layer_split_daemon(const LayerSplitDaemonConfig & cfg) {
     const auto info = inspect_gguf_model_info(cfg.target_path);
@@ -192,4 +192,4 @@ int run_layer_split_daemon(const LayerSplitDaemonConfig & cfg) {
     return 0;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_daemon_loop.h b/dflash/src/qwen35/layer_split_daemon_loop.h
index 015ccad4a..2215f00ba 100644
--- a/dflash/src/qwen35/layer_split_daemon_loop.h
+++ b/dflash/src/qwen35/layer_split_daemon_loop.h
@@ -19,7 +19,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct LayerSplitDaemonConfig {
     const char * target_path = nullptr;
@@ -43,4 +43,4 @@ struct LayerSplitDaemonConfig {
 // Run the layer-split daemon event loop. Returns exit code (0 = success).
 int run_layer_split_daemon(const LayerSplitDaemonConfig & cfg);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_forward.cpp b/dflash/src/qwen35/layer_split_forward.cpp
index 5f6b3f9f7..985be9502 100644
--- a/dflash/src/qwen35/layer_split_forward.cpp
+++ b/dflash/src/qwen35/layer_split_forward.cpp
@@ -15,7 +15,7 @@
 #include <cstdio>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 bool compute_target_split_argmax(
         StepGraph & sg,
@@ -233,4 +233,4 @@ void free_target_layer_split_shards(std::vector<TargetLayerSplitShard> & shards)
     shards.clear();
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_forward.h b/dflash/src/qwen35/layer_split_forward.h
index cc032c1f3..7bf0b9222 100644
--- a/dflash/src/qwen35/layer_split_forward.h
+++ b/dflash/src/qwen35/layer_split_forward.h
@@ -17,7 +17,7 @@
 #include <cstdint>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Compute argmax(logits) for a slice of the activation tensor via
 // out_norm + lm_head projection.
@@ -52,4 +52,4 @@ bool run_target_layer_split_forward(
 // Free all shards (weights, cache, backend).
 void free_target_layer_split_shards(std::vector<TargetLayerSplitShard> & shards);
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/layer_split_types.h b/dflash/src/qwen35/layer_split_types.h
index 3e4929251..3920d6f3c 100644
--- a/dflash/src/qwen35/layer_split_types.h
+++ b/dflash/src/qwen35/layer_split_types.h
@@ -17,7 +17,7 @@
 #include <cstdio>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Per-GPU shard for layer-split target ────────────────────────────
 
@@ -41,4 +41,4 @@ inline TargetLayerSplitShard * find_target_shard(
     return nullptr;
 }
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp
index ecd8f745e..8b08d69e6 100644
--- a/dflash/src/qwen35/qwen35_backend.cpp
+++ b/dflash/src/qwen35/qwen35_backend.cpp
@@ -20,7 +20,7 @@
 #include <cstdlib>
 #include <cstring>
 
-namespace dflash27b {
+namespace dflash::common {
 
 #define IS_EOS_TOK(tok, w)                                         \
     ( ((w).eos_chat_id >= 0 && (tok) == (w).eos_chat_id)                  \
@@ -370,7 +370,7 @@ bool Qwen35Backend::handle_compress(const std::string & line, const DaemonIO & i
 void Qwen35Backend::free_drafter() {
     if (drafter_loaded_) {
         // Drafter has its own backend — do a full free (weights + backend)
-        dflash27b::free_drafter(drafter_ctx_);
+        dflash::common::free_drafter(drafter_ctx_);
         drafter_loaded_ = false;
         std::printf("[drafter] freed\n"); std::fflush(stdout);
     }
@@ -975,4 +975,4 @@ int Qwen35Backend::verify_tree(int committed, const DDTree & tree) {
     return 0;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_backend.h b/dflash/src/qwen35/qwen35_backend.h
index 18e444b2c..d87f5f0b8 100644
--- a/dflash/src/qwen35/qwen35_backend.h
+++ b/dflash/src/qwen35/qwen35_backend.h
@@ -28,7 +28,7 @@
 #include <string>
 #include <cstddef>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ── Configuration passed at construction ────────────────────────────────
 
@@ -182,4 +182,4 @@ class Qwen35Backend : public ModelBackend {
     int verify_tree(int committed, const DDTree & tree);
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_daemon.cpp b/dflash/src/qwen35/qwen35_daemon.cpp
index b67849e6b..d1a14a915 100644
--- a/dflash/src/qwen35/qwen35_daemon.cpp
+++ b/dflash/src/qwen35/qwen35_daemon.cpp
@@ -10,7 +10,7 @@
 
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 int run_qwen35_daemon(const Qwen35DaemonArgs & args) {
     Qwen35Config cfg;
@@ -42,4 +42,4 @@ int run_qwen35_daemon(const Qwen35DaemonArgs & args) {
     return run_daemon(backend, dargs);
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_daemon.h b/dflash/src/qwen35/qwen35_daemon.h
index 52786b577..2135eca3f 100644
--- a/dflash/src/qwen35/qwen35_daemon.h
+++ b/dflash/src/qwen35/qwen35_daemon.h
@@ -8,7 +8,7 @@
 #include "device_placement.h"
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Qwen35DaemonArgs {
     const char * target_path    = nullptr;
@@ -39,4 +39,4 @@ struct Qwen35DaemonArgs {
 // Run the qwen35 daemon loop. Returns 0 on clean exit, 1 on init failure.
 int run_qwen35_daemon(const Qwen35DaemonArgs & args);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_dflash_target.cpp b/dflash/src/qwen35/qwen35_dflash_target.cpp
index 49d13e327..65713d1bb 100644
--- a/dflash/src/qwen35/qwen35_dflash_target.cpp
+++ b/dflash/src/qwen35/qwen35_dflash_target.cpp
@@ -5,7 +5,7 @@
 #include "step_graph.h"
 #include "attn_masks.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 Qwen35DFlashTarget::~Qwen35DFlashTarget() {
     step_graph_destroy(proj_sg_);
@@ -149,4 +149,4 @@ const std::vector<int> & Qwen35DFlashTarget::capture_layer_ids() const {
     return capture_ids_;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_dflash_target.h b/dflash/src/qwen35/qwen35_dflash_target.h
index ec2d609c6..6a72e48b5 100644
--- a/dflash/src/qwen35/qwen35_dflash_target.h
+++ b/dflash/src/qwen35/qwen35_dflash_target.h
@@ -16,7 +16,7 @@
 
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 class Qwen35DFlashTarget : public DFlashTarget {
 public:
@@ -68,4 +68,4 @@ class Qwen35DFlashTarget : public DFlashTarget {
     StepGraph proj_sg_;
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_layer_split.h b/dflash/src/qwen35/qwen35_layer_split.h
index 2fb487654..1d3358043 100644
--- a/dflash/src/qwen35/qwen35_layer_split.h
+++ b/dflash/src/qwen35/qwen35_layer_split.h
@@ -17,7 +17,7 @@
 
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct Qwen35LayerSplitDaemonArgs {
     const char * target_path = nullptr;
@@ -35,4 +35,4 @@ struct Qwen35LayerSplitDaemonArgs {
 // Will be fully implemented here once helpers are extracted.
 int run_qwen35_layer_split_daemon(const Qwen35LayerSplitDaemonArgs & args);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp b/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp
index 39d87c273..76daca64f 100644
--- a/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp
+++ b/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp
@@ -6,7 +6,7 @@
 #include "graph_builders.h"
 #include "step_graph.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 Qwen35LayerSplitDFlashTarget::~Qwen35LayerSplitDFlashTarget() {
     step_graph_destroy(proj_sg_);
@@ -104,4 +104,4 @@ const std::vector<int> & Qwen35LayerSplitDFlashTarget::capture_layer_ids() const
     return capture_ids_;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.h b/dflash/src/qwen35/qwen35_layer_split_dflash_target.h
index f6f149b16..f593009d8 100644
--- a/dflash/src/qwen35/qwen35_layer_split_dflash_target.h
+++ b/dflash/src/qwen35/qwen35_layer_split_dflash_target.h
@@ -21,7 +21,7 @@
 
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 class Qwen35LayerSplitDFlashTarget : public DFlashTarget {
 public:
@@ -66,4 +66,4 @@ class Qwen35LayerSplitDFlashTarget : public DFlashTarget {
     StepGraph        proj_sg_;
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/qwen35/qwen35_target_graph.cpp b/dflash/src/qwen35/qwen35_target_graph.cpp
index 9c7871632..fdb3a9141 100644
--- a/dflash/src/qwen35/qwen35_target_graph.cpp
+++ b/dflash/src/qwen35/qwen35_target_graph.cpp
@@ -39,7 +39,7 @@
 #include <cstdlib>
 #include <cstring>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Local qwen35 constants (from the GGUF, hardcoded for this model) ─
 // These complement the DFLASH27B_* macros in dflash27b.h with qwen35-specific
@@ -1544,4 +1544,4 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick,
 }
 
 
-} // namespace dflash27b
+} // namespace dflash::common
diff --git a/dflash/src/server/api_types.h b/dflash/src/server/api_types.h
index e5803bf03..9fa187997 100644
--- a/dflash/src/server/api_types.h
+++ b/dflash/src/server/api_types.h
@@ -1,8 +1,8 @@
 // Shared types for the server components.
 #pragma once
 
-namespace dflash27b {
+namespace dflash::common {
 
 enum class ApiFormat { OPENAI_CHAT, ANTHROPIC, RESPONSES, COMPLETIONS };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/chat_template.cpp b/dflash/src/server/chat_template.cpp
index 758502888..92c465880 100644
--- a/dflash/src/server/chat_template.cpp
+++ b/dflash/src/server/chat_template.cpp
@@ -2,7 +2,7 @@
 
 #include "chat_template.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Qwen3.5 tool preamble — matches the official Jinja template exactly.
 static const char QWEN3_TOOL_PREAMBLE[] =
@@ -155,4 +155,4 @@ std::string render_chat_template(
     return result;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/chat_template.h b/dflash/src/server/chat_template.h
index d6c34b254..5f35f4925 100644
--- a/dflash/src/server/chat_template.h
+++ b/dflash/src/server/chat_template.h
@@ -1,4 +1,4 @@
-// Chat template renderer for dflash27b native server.
+// Chat template renderer for the dflash native server.
 //
 // Renders chat messages (system/user/assistant/tool) into the model-specific
 // token format. Hard-coded for supported architectures:
@@ -10,7 +10,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // A single message in a chat conversation.
 struct ChatMessage {
@@ -49,4 +49,4 @@ std::string render_chat_template(
 // Detect the appropriate chat format for an architecture.
 ChatFormat chat_format_for_arch(const std::string & arch);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/disk_prefix_cache.cpp b/dflash/src/server/disk_prefix_cache.cpp
index c60d5ca76..ca62469fc 100644
--- a/dflash/src/server/disk_prefix_cache.cpp
+++ b/dflash/src/server/disk_prefix_cache.cpp
@@ -16,7 +16,7 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Inline SHA-1 (same as prefix_cache.cpp) ────────────────────────────
 
@@ -729,4 +729,4 @@ bool DiskPrefixCache::read_header(FILE * f, DiskCacheHeader & hdr) {
     return true;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/disk_prefix_cache.h b/dflash/src/server/disk_prefix_cache.h
index 508f5a07f..d4bcd7d49 100644
--- a/dflash/src/server/disk_prefix_cache.h
+++ b/dflash/src/server/disk_prefix_cache.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Configuration ──────────────────────────────────────────────────────
 
@@ -166,4 +166,4 @@ class DiskPrefixCache {
     static bool read_header(FILE * f, DiskCacheHeader & hdr);
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp
index 57f27ed80..5d61da306 100644
--- a/dflash/src/server/http_server.cpp
+++ b/dflash/src/server/http_server.cpp
@@ -21,7 +21,7 @@
 #include <sys/socket.h>
 #include <unistd.h>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Utilities ──────────────────────────────────────────────────────────
 
@@ -1086,4 +1086,4 @@ bool HttpServer::send_sse_headers(int fd) {
     return send_all(fd, header.data(), header.size());
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h
index c879d608d..8c0ec9eb7 100644
--- a/dflash/src/server/http_server.h
+++ b/dflash/src/server/http_server.h
@@ -1,4 +1,4 @@
-// HTTP server infrastructure for dflash27b native server.
+// HTTP server infrastructure for the dflash native server.
 //
 // Ported from ds4_server.c's socket/threading/HTTP layer, converted to C++.
 // Architecture:
@@ -32,7 +32,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 using json = nlohmann::json;
 
@@ -186,4 +186,4 @@ struct ServerJob {
     ServerJob *   next = nullptr;
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/prefix_cache.cpp b/dflash/src/server/prefix_cache.cpp
index 6765b97ad..72ceae720 100644
--- a/dflash/src/server/prefix_cache.cpp
+++ b/dflash/src/server/prefix_cache.cpp
@@ -7,7 +7,7 @@
 #include <cstring>
 #include <chrono>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Chat marker resolution ────────────────────────────────────────────
 
@@ -424,4 +424,4 @@ void PrefixCache::abort_full_snap(int /*slot*/) {
     full_has_pending_evict_ = false;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/prefix_cache.h b/dflash/src/server/prefix_cache.h
index a3451ee97..cb0c551bd 100644
--- a/dflash/src/server/prefix_cache.h
+++ b/dflash/src/server/prefix_cache.h
@@ -21,7 +21,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Chat marker detection ──────────────────────────────────────────────
 
@@ -142,4 +142,4 @@ class PrefixCache {
     void move_full_to_end(int idx);
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/reasoning.cpp b/dflash/src/server/reasoning.cpp
index 3b084e5e3..aec8912a8 100644
--- a/dflash/src/server/reasoning.cpp
+++ b/dflash/src/server/reasoning.cpp
@@ -2,7 +2,7 @@
 
 #include "reasoning.h"
 
-namespace dflash27b {
+namespace dflash::common {
 
 static const char THINK_OPEN[]  = "<think>";
 static const char THINK_CLOSE[] = "</think>";
@@ -85,4 +85,4 @@ ReasoningResult parse_reasoning(const std::string & text,
     return result;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/reasoning.h b/dflash/src/server/reasoning.h
index 95ea3046e..a7c1f8673 100644
--- a/dflash/src/server/reasoning.h
+++ b/dflash/src/server/reasoning.h
@@ -7,7 +7,7 @@
 #include <string>
 #include <utility>
 
-namespace dflash27b {
+namespace dflash::common {
 
 struct ReasoningResult {
     std::string content;    // cleaned content (think tags removed)
@@ -22,4 +22,4 @@ ReasoningResult parse_reasoning(const std::string & text,
                                 bool thinking_enabled = true,
                                 bool started_in_thinking = false);
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp
index d1ba7cdc3..319f97ded 100644
--- a/dflash/src/server/server_main.cpp
+++ b/dflash/src/server/server_main.cpp
@@ -1,4 +1,4 @@
-// dflash_server — native C++ HTTP server for dflash27b.
+// dflash_server — native C++ HTTP server for dflash.
 //
 // Replaces the Python server.py for production use. Owns the ModelBackend
 // directly (no subprocess, no pipe protocol), enabling:
@@ -22,7 +22,7 @@
 #include <memory>
 #include <string>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 // Global server pointer for signal handling.
 static HttpServer * g_server = nullptr;
diff --git a/dflash/src/server/sse_emitter.cpp b/dflash/src/server/sse_emitter.cpp
index 26f4158ce..59a5fe7d0 100644
--- a/dflash/src/server/sse_emitter.cpp
+++ b/dflash/src/server/sse_emitter.cpp
@@ -7,7 +7,7 @@
 #include <chrono>
 #include <cstdio>
 
-namespace dflash27b {
+namespace dflash::common {
 
 static const char THINK_OPEN[]  = "<think>";
 static const char THINK_CLOSE[] = "</think>";
@@ -563,4 +563,4 @@ std::string SseEmitter::finish_reason() const {
     return tool_calls_.empty() ? "stop" : "tool_calls";
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/sse_emitter.h b/dflash/src/server/sse_emitter.h
index 0212ec878..db5afc220 100644
--- a/dflash/src/server/sse_emitter.h
+++ b/dflash/src/server/sse_emitter.h
@@ -16,7 +16,7 @@
 #include <string>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 using json = nlohmann::json;
 
@@ -108,4 +108,4 @@ class SseEmitter {
     static constexpr size_t HOLDBACK = 12;  // max(len("<tool_call>"), len("</think>"), len("<think>"))
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/tokenizer.cpp b/dflash/src/server/tokenizer.cpp
index 5619d69f2..1f538682b 100644
--- a/dflash/src/server/tokenizer.cpp
+++ b/dflash/src/server/tokenizer.cpp
@@ -16,7 +16,7 @@
 #include <limits>
 #include <utility>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Unicode helpers ────────────────────────────────────────────────────
 
@@ -714,4 +714,4 @@ int32_t Tokenizer::token_to_id(const std::string & token) const {
     return it != token_to_id_.end() ? it->second : -1;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/tokenizer.h b/dflash/src/server/tokenizer.h
index ff516f28c..f28dfd9fc 100644
--- a/dflash/src/server/tokenizer.h
+++ b/dflash/src/server/tokenizer.h
@@ -1,4 +1,4 @@
-// BPE tokenizer for dflash27b native server.
+// BPE tokenizer for the dflash native server.
 //
 // Loads vocabulary (token strings) and merge rules from GGUF metadata,
 // then provides encode (text → token IDs) and decode (token IDs → text).
@@ -14,7 +14,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 class Tokenizer {
 public:
@@ -82,4 +82,4 @@ class Tokenizer {
     PreTokenizer pre_type_ = PreTokenizer::QWEN35;
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/tool_memory.cpp b/dflash/src/server/tool_memory.cpp
index 75b1f63ed..1d54f11f7 100644
--- a/dflash/src/server/tool_memory.cpp
+++ b/dflash/src/server/tool_memory.cpp
@@ -4,7 +4,7 @@
 
 #include <algorithm>
 
-namespace dflash27b {
+namespace dflash::common {
 
 ToolMemory::ToolMemory(size_t max_entries, size_t max_bytes)
     : max_entries_(max_entries)
@@ -136,4 +136,4 @@ void ToolMemory::prune() {
     }
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/tool_memory.h b/dflash/src/server/tool_memory.h
index c9167c270..0ff92ac0e 100644
--- a/dflash/src/server/tool_memory.h
+++ b/dflash/src/server/tool_memory.h
@@ -14,7 +14,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace dflash27b {
+namespace dflash::common {
 
 class ToolMemory {
 public:
@@ -55,4 +55,4 @@ class ToolMemory {
     std::unordered_map<std::string, std::list<std::string>::iterator> lru_map_;
 };
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/tool_parser.cpp b/dflash/src/server/tool_parser.cpp
index 44da48a51..18fb74263 100644
--- a/dflash/src/server/tool_parser.cpp
+++ b/dflash/src/server/tool_parser.cpp
@@ -16,7 +16,7 @@
 #include <regex>
 #include <sstream>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // ─── Helpers ────────────────────────────────────────────────────────────
 
@@ -465,4 +465,4 @@ ToolParseResult parse_tool_calls(const std::string & text, const json & tools) {
     return result;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/tool_parser.h b/dflash/src/server/tool_parser.h
index 1429e6763..e891104bc 100644
--- a/dflash/src/server/tool_parser.h
+++ b/dflash/src/server/tool_parser.h
@@ -15,7 +15,7 @@
 #include <vector>
 #include <utility>
 
-namespace dflash27b {
+namespace dflash::common {
 
 using json = nlohmann::json;
 
@@ -35,4 +35,4 @@ struct ToolParseResult {
 ToolParseResult parse_tool_calls(const std::string & text,
                                  const json & tools = json());
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/src/server/utf8_utils.h b/dflash/src/server/utf8_utils.h
index a40b04cc3..909107d51 100644
--- a/dflash/src/server/utf8_utils.h
+++ b/dflash/src/server/utf8_utils.h
@@ -9,7 +9,7 @@
 #include <cstdint>
 #include <string>
 
-namespace dflash27b {
+namespace dflash::common {
 
 // Snap a byte offset back to a UTF-8 code-point boundary.
 // Returns the largest position <= `pos` that doesn't split a multi-byte sequence.
@@ -74,4 +74,4 @@ inline std::string utf8_sanitize(const std::string & s) {
     return out;
 }
 
-}  // namespace dflash27b
+}  // namespace dflash::common
diff --git a/dflash/test/bench_laguna_generate.cpp b/dflash/test/bench_laguna_generate.cpp
index 559db5695..cc068a163 100644
--- a/dflash/test/bench_laguna_generate.cpp
+++ b/dflash/test/bench_laguna_generate.cpp
@@ -24,7 +24,7 @@
 #include "ggml-backend.h"
 #include "ggml-cuda.h"
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 // Forward step lives in src/laguna_target_graph.cpp::laguna_step(). The
 // bench just times prefill + decode loops on top of it.
diff --git a/dflash/test/bench_laguna_pflash.cpp b/dflash/test/bench_laguna_pflash.cpp
index b4df05fbd..46b2311e0 100644
--- a/dflash/test/bench_laguna_pflash.cpp
+++ b/dflash/test/bench_laguna_pflash.cpp
@@ -27,7 +27,7 @@
 #include "ggml-backend.h"
 #include "ggml-cuda.h"
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 // Chunked prefill loop on top of the shared laguna_step() helper. Reports
 // total prefill time and the argmax / logit at the LAST chunk.
diff --git a/dflash/test/bench_laguna_ttft.cpp b/dflash/test/bench_laguna_ttft.cpp
index 8aec1613d..19be95c83 100644
--- a/dflash/test/bench_laguna_ttft.cpp
+++ b/dflash/test/bench_laguna_ttft.cpp
@@ -29,7 +29,7 @@
 #include "ggml-backend.h"
 #include "ggml-cuda.h"
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 static std::vector<int> parse_csv(const std::string & s, std::vector<int> dflt) {
     if (s.empty()) return dflt;
diff --git a/dflash/test/pflash_daemon.cpp b/dflash/test/pflash_daemon.cpp
index d19591e5c..38e291e8a 100644
--- a/dflash/test/pflash_daemon.cpp
+++ b/dflash/test/pflash_daemon.cpp
@@ -30,7 +30,7 @@
 #include <unistd.h>
 #endif
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 static std::vector<int32_t> read_counted_i32_file(const std::string & path) {
     std::ifstream f(path, std::ios::binary);
diff --git a/dflash/test/smoke_draft_graph.cpp b/dflash/test/smoke_draft_graph.cpp
index b64cb7a36..544f8b51f 100644
--- a/dflash/test/smoke_draft_graph.cpp
+++ b/dflash/test/smoke_draft_graph.cpp
@@ -30,7 +30,7 @@
 #include <random>
 #include <vector>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 // Convert fp32 -> bf16 (truncation)
 static uint16_t f32_to_bf16(float f) {
diff --git a/dflash/test/smoke_laguna_forward.cpp b/dflash/test/smoke_laguna_forward.cpp
index d92c6fc35..dbf340194 100644
--- a/dflash/test/smoke_laguna_forward.cpp
+++ b/dflash/test/smoke_laguna_forward.cpp
@@ -22,7 +22,7 @@
 #include "ggml-cuda.h"
 #include "ggml-alloc.h"
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
diff --git a/dflash/test/smoke_load_draft.cpp b/dflash/test/smoke_load_draft.cpp
index 983e44231..9e0f37b42 100644
--- a/dflash/test/smoke_load_draft.cpp
+++ b/dflash/test/smoke_load_draft.cpp
@@ -17,7 +17,7 @@
 #include <cstring>
 #include <vector>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
diff --git a/dflash/test/smoke_load_target.cpp b/dflash/test/smoke_load_target.cpp
index dea06dbbc..595c13530 100644
--- a/dflash/test/smoke_load_target.cpp
+++ b/dflash/test/smoke_load_target.cpp
@@ -16,7 +16,7 @@
 #include <cstring>
 #include <vector>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
diff --git a/dflash/test/smoke_load_target_laguna.cpp b/dflash/test/smoke_load_target_laguna.cpp
index 6da68a4e0..c426cc1ee 100644
--- a/dflash/test/smoke_load_target_laguna.cpp
+++ b/dflash/test/smoke_load_target_laguna.cpp
@@ -15,7 +15,7 @@
 #include "ggml-backend.h"
 #include "ggml-cuda.h"
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
diff --git a/dflash/test/smoke_qwen3_forward.cpp b/dflash/test/smoke_qwen3_forward.cpp
index 7d46748e9..4efa41781 100644
--- a/dflash/test/smoke_qwen3_forward.cpp
+++ b/dflash/test/smoke_qwen3_forward.cpp
@@ -24,7 +24,7 @@
 #include <string>
 #include <vector>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 3) {
diff --git a/dflash/test/smoke_target_forward.cpp b/dflash/test/smoke_target_forward.cpp
index dcb61390d..285526ddb 100644
--- a/dflash/test/smoke_target_forward.cpp
+++ b/dflash/test/smoke_target_forward.cpp
@@ -25,7 +25,7 @@
 #include <cstring>
 #include <vector>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp
index 9dc38e8d4..32ed24e20 100644
--- a/dflash/test/test_dflash.cpp
+++ b/dflash/test/test_dflash.cpp
@@ -24,7 +24,7 @@
 #include "qwen3_drafter.h"
 #include "gpu_runtime_compat.h"
 #include "laguna_daemon.h"  // arch dispatch - laguna targets are served by
-                            // dflash27b::run_laguna_daemon() instead of the
+                            // dflash::common::run_laguna_daemon() instead of the
                             // qwen35 + DFlash + DDTree pipeline below.
 #include "qwen35_daemon.h"  // arch dispatch - single-GPU qwen35 daemon mode
 #include "qwen35_layer_split.h" // multi-GPU layer-split daemon args
@@ -92,7 +92,7 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);
 #include <random>
 #include <unordered_set>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 static SamplerCfg      g_sampler;
 static std::mt19937_64 g_sampler_rng{std::random_device{}()};
@@ -110,16 +110,16 @@ static std::mt19937_64 g_sampler_rng{std::random_device{}()};
 
 // ─── Small utilities — extracted to src/common/io_utils.h ──────────
 #include "io_utils.h"
-using dflash27b::read_int32_file;
-using dflash27b::write_int32_file;
-using dflash27b::stream_emit_fd;
-using dflash27b::argmax_f32;
-using dflash27b::write_binary_file;
-using dflash27b::read_binary_file_exact;
-using dflash27b::read_line_tail;
+using dflash::common::read_int32_file;
+using dflash::common::write_int32_file;
+using dflash::common::stream_emit_fd;
+using dflash::common::argmax_f32;
+using dflash::common::write_binary_file;
+using dflash::common::read_binary_file_exact;
+using dflash::common::read_line_tail;
 #if !defined(_WIN32)
-using dflash27b::read_exact_fd;
-using dflash27b::write_exact_fd;
+using dflash::common::read_exact_fd;
+using dflash::common::write_exact_fd;
 #endif
 
 // CPU sampler chain (SamplerCfg / sample_logits / parse_sampler_token) lives
@@ -134,12 +134,12 @@ using dflash27b::write_exact_fd;
 // The global `g_kq_stride_pad` below is set at init time and forwarded to
 // build_causal_mask / build_tree_mask (now in src/qwen35/attn_masks.h).
 #include "attn_masks.h"
-using dflash27b::KQ_MASK_PAD;
-using dflash27b::F16_ZERO;
-using dflash27b::F16_NEG_INF;
-using dflash27b::align_up;
-using dflash27b::build_causal_mask;
-using dflash27b::build_tree_mask;
+using dflash::common::KQ_MASK_PAD;
+using dflash::common::F16_ZERO;
+using dflash::common::F16_NEG_INF;
+using dflash::common::align_up;
+using dflash::common::build_causal_mask;
+using dflash::common::build_tree_mask;
 static int g_kq_stride_pad = KQ_MASK_PAD;   // overridden to 256 when TBQ KV is active
 static int g_max_ctx_override = 0;           // overridden by --max-ctx=N (default 4096)
 static int g_fa_window       = 2048;         // overridden by DFLASH27B_FA_WINDOW=N
@@ -150,50 +150,50 @@ static int g_draft_ctx_max   = 4096;        // draft context cap; --draft-ctx-ma
 // Extracted to src/qwen35/ddtree.{h,cpp}. Provides DDTree struct,
 // extract_draft_topk(), build_ddtree(), follow_verified_tree().
 #include "ddtree.h"
-using dflash27b::DDTree;
-using dflash27b::extract_draft_topk;
-using dflash27b::build_ddtree;
-using dflash27b::follow_verified_tree;
+using dflash::common::DDTree;
+using dflash::common::extract_draft_topk;
+using dflash::common::build_ddtree;
+using dflash::common::follow_verified_tree;
 
 // ─── StepGraph — extracted to src/qwen35/step_graph.h ──
 #include "step_graph.h"
-using dflash27b::StepGraph;
-using dflash27b::step_graph_free;
-using dflash27b::step_graph_destroy;
+using dflash::common::StepGraph;
+using dflash::common::step_graph_free;
+using dflash::common::step_graph_destroy;
 
 // ─── Peer access + DraftFeatureMirror — extracted to src/qwen35/ ──
 #include "peer_access.h"
 #include "dflash_feature_ring.h"
-using dflash27b::g_peer_access_opt_in;
-using dflash27b::g_peer_pair_ok_cache;
-using dflash27b::enable_peer_access_one_way;
-using dflash27b::enable_peer_access_pair;
-using dflash27b::cross_device_peer_memcpy_ok;
-using dflash27b::copy_peer_async;
-using dflash27b::DraftFeatureMirror;
-using dflash27b::draft_feature_mirror_free;
-using dflash27b::draft_feature_mirror_init;
-using dflash27b::draft_feature_mirror_can_view;
-using dflash27b::draft_feature_mirror_sync_range;
-using dflash27b::draft_feature_mirror_sync_tail;
+using dflash::common::g_peer_access_opt_in;
+using dflash::common::g_peer_pair_ok_cache;
+using dflash::common::enable_peer_access_one_way;
+using dflash::common::enable_peer_access_pair;
+using dflash::common::cross_device_peer_memcpy_ok;
+using dflash::common::copy_peer_async;
+using dflash::common::DraftFeatureMirror;
+using dflash::common::draft_feature_mirror_free;
+using dflash::common::draft_feature_mirror_init;
+using dflash::common::draft_feature_mirror_can_view;
+using dflash::common::draft_feature_mirror_sync_range;
+using dflash::common::draft_feature_mirror_sync_tail;
 
 // ─── Graph builders — extracted to src/qwen35/graph_builders.{h,cpp} ──
 #include "graph_builders.h"
 #include "dflash_draft_graph.h"
-using dflash27b::build_layer_step;
-using dflash27b::build_target_step;
-using dflash27b::build_target_step_tree;
-using dflash27b::build_draft_step;
-using dflash27b::build_lm_head_projection_step;
+using dflash::common::build_layer_step;
+using dflash::common::build_target_step;
+using dflash::common::build_target_step_tree;
+using dflash::common::build_draft_step;
+using dflash::common::build_lm_head_projection_step;
 
 // ─── Layer split types — extracted to src/qwen35/layer_split_types.h ──
 #include "layer_split_types.h"
-using dflash27b::LayerSplitRuntimeConfig;
-using dflash27b::TargetLayerSplitShard;
-using dflash27b::ActivationPair;
-using dflash27b::activation_pair_free;
-using dflash27b::activation_pair_init;
-using dflash27b::find_target_shard;
+using dflash::common::LayerSplitRuntimeConfig;
+using dflash::common::TargetLayerSplitShard;
+using dflash::common::ActivationPair;
+using dflash::common::activation_pair_free;
+using dflash::common::activation_pair_init;
+using dflash::common::find_target_shard;
 
 static bool parse_int_list(const char * text, std::vector<int> & out) {
     out.clear();
@@ -229,39 +229,39 @@ static bool parse_float_list(const char * text, std::vector<double> & out) {
 
 // ─── Draft IPC — extracted to src/qwen35/draft_ipc.{h,cpp} ──
 #include "dflash_draft_ipc.h"
-using dflash27b::DFlashDraftIpcClient;
-using dflash27b::copy_capture_slice_to_remote_draft;
-using dflash27b::stream_status;
-using dflash27b::run_dflash_draft_ipc_daemon;
+using dflash::common::DFlashDraftIpcClient;
+using dflash::common::copy_capture_slice_to_remote_draft;
+using dflash::common::stream_status;
+using dflash::common::run_dflash_draft_ipc_daemon;
 
 // ─── GGUF inspection — extracted to src/common/gguf_inspect.{h,cpp} ──
 #include "gguf_inspect.h"
 
 // ─── Layer ranges — extracted to src/common/layer_split_utils.{h,cpp} ──
 #include "layer_split_utils.h"
-using dflash27b::compute_layer_ranges;
+using dflash::common::compute_layer_ranges;
 
 // ─── Feature copy helpers — extracted to src/qwen35/feature_copy.{h,cpp} ──
 #include "dflash_capture.h"
-using dflash27b::target_capture_index;
-using dflash27b::copy_capture_slice_to_draft_ring;
-using dflash27b::copy_feature_ring_range_to_tensor;
+using dflash::common::target_capture_index;
+using dflash::common::copy_capture_slice_to_draft_ring;
+using dflash::common::copy_feature_ring_range_to_tensor;
 
 // ─── Layer-split forward — extracted to src/qwen35/layer_split_forward.{h,cpp} ──
 #include "layer_split_forward.h"
-using dflash27b::compute_target_split_argmax;
-using dflash27b::run_target_layer_split_forward;
-using dflash27b::free_target_layer_split_shards;
+using dflash::common::compute_target_split_argmax;
+using dflash::common::run_target_layer_split_forward;
+using dflash::common::free_target_layer_split_shards;
 
 
 // ─── Speculative decode — generic loop in common/, qwen35 layer-split adapter.
 #include "qwen35_layer_split_dflash_target.h"
 #include "common/dflash_spec_decode.h"
-using dflash27b::is_eos_tok;
+using dflash::common::is_eos_tok;
 
 // ─── Layer-split daemon — extracted to src/qwen35/layer_split_daemon.{h,cpp} ─
 #include "layer_split_daemon.h"
-using dflash27b::run_target_layer_split_request;
+using dflash::common::run_target_layer_split_request;
 
 static int run_target_layer_split_daemon(
         const char * target_path,
@@ -318,7 +318,7 @@ static int run_target_layer_split_harness(
         std::fprintf(stderr, "target layer split requires prompt/n_gen/out positional args\n");
         return 2;
     }
-    const int n_layer = dflash27b::inspect_gguf_model_info(target_path).n_layer;
+    const int n_layer = dflash::common::inspect_gguf_model_info(target_path).n_layer;
     if (n_layer <= 0) {
         std::fprintf(stderr, "target-split could not read qwen35.block_count\n");
         return 1;
@@ -712,7 +712,7 @@ int main(int argc, char ** argv) {
     // shape so we can route laguna requests to run_laguna_daemon() and
     // accept the no-draft argv layout server.py uses for that arch.
     #include "gguf_inspect.h"
-    const auto model_info   = dflash27b::inspect_gguf_model_info(target_path);
+    const auto model_info   = dflash::common::inspect_gguf_model_info(target_path);
     const std::string detected_arch = model_info.arch;
     const bool is_laguna = (detected_arch == "laguna");
     const bool is_qwen3  = (detected_arch == "qwen3");
@@ -972,13 +972,13 @@ int main(int argc, char ** argv) {
             "[test_dflash] arch=laguna -> dispatching to run_laguna_daemon "
             "(max_ctx=%d kv=%s chunk=%d stream_fd=%d). DFlash + DDTree disabled.\n",
             max_ctx_eff, ggml_type_name(kv), chunk, stream_fd);
-        dflash27b::LagunaDaemonArgs largs;
+        dflash::common::LagunaDaemonArgs largs;
         largs.target_path     = target_path;
         largs.device.max_ctx  = max_ctx_eff;
         largs.chunk           = chunk;
         largs.kv_type         = kv;
         largs.stream_fd       = stream_fd;
-        return dflash27b::run_laguna_daemon(largs);
+        return dflash::common::run_laguna_daemon(largs);
     }
 
     // ---- Arch dispatch: qwen3 targets to the dedicated daemon -----
@@ -987,13 +987,13 @@ int main(int argc, char ** argv) {
         std::fprintf(stderr,
             "[test_dflash] arch=qwen3 -> dispatching to run_qwen3_daemon "
             "(max_ctx=%d stream_fd=%d)\n", max_ctx_eff, stream_fd);
-        dflash27b::Qwen3DaemonArgs q3args;
+        dflash::common::Qwen3DaemonArgs q3args;
         q3args.model_path     = target_path;
         q3args.device.gpu     = target_gpu;
         q3args.device.max_ctx = max_ctx_eff;
         q3args.stream_fd      = stream_fd;
         q3args.chunk          = 512;
-        return dflash27b::run_qwen3_daemon(q3args);
+        return dflash::common::run_qwen3_daemon(q3args);
     }
 
     // ---- Arch dispatch: gemma4 targets to the dedicated daemon -----
@@ -1002,13 +1002,13 @@ int main(int argc, char ** argv) {
         std::fprintf(stderr,
             "[test_dflash] arch=gemma4 -> dispatching to run_gemma4_daemon "
             "(max_ctx=%d stream_fd=%d)\n", max_ctx_eff, stream_fd);
-        dflash27b::Gemma4DaemonArgs g4args;
+        dflash::common::Gemma4DaemonArgs g4args;
         g4args.model_path     = target_path;
         g4args.device.gpu     = target_gpu;
         g4args.device.max_ctx = max_ctx_eff;
         g4args.stream_fd      = stream_fd;
         g4args.chunk          = 512;
-        return dflash27b::run_gemma4_daemon(g4args);
+        return dflash::common::run_gemma4_daemon(g4args);
     }
 
     // Helper: write a committed token to the stream fd immediately (int32 LE).
@@ -1073,7 +1073,7 @@ int main(int argc, char ** argv) {
             return 2;
         }
         if (daemon_mode) {
-            dflash27b::Qwen35LayerSplitDaemonArgs lsargs;
+            dflash::common::Qwen35LayerSplitDaemonArgs lsargs;
             lsargs.target_path = target_path;
             lsargs.draft_path  = draft_path;
             lsargs.device.layer_split_gpus    = target_gpus;
@@ -1127,7 +1127,7 @@ int main(int argc, char ** argv) {
     // loop remains for one-shot, test-window, and profile-scaling modes.
     if (daemon_mode && target_gpus.size() <= 1) {
         const int max_ctx_eff = g_max_ctx_override > 0 ? g_max_ctx_override : 4096;
-        dflash27b::Qwen35DaemonArgs qargs;
+        dflash::common::Qwen35DaemonArgs qargs;
         qargs.target_path       = target_path;
         qargs.draft_path        = draft_path;
         qargs.device.gpu        = target_gpu;
@@ -1149,7 +1149,7 @@ int main(int argc, char ** argv) {
         std::fprintf(stderr,
             "[test_dflash] arch=qwen35 daemon -> dispatching to run_qwen35_daemon "
             "(max_ctx=%d stream_fd=%d)\n", max_ctx_eff, stream_fd);
-        return dflash27b::run_qwen35_daemon(qargs);
+        return dflash::common::run_qwen35_daemon(qargs);
     }
 
     const bool split_gpus = target_gpu != draft_gpu;
@@ -1509,7 +1509,7 @@ int main(int argc, char ** argv) {
     bool target_parked = false;
     bool draft_parked  = false;
     // pflash drafter (lazy-loaded on first `compress` command)
-    dflash27b::DrafterContext drafter_ctx;
+    dflash::common::DrafterContext drafter_ctx;
     bool drafter_loaded = false;
 
     while (true) {
@@ -1558,7 +1558,7 @@ int main(int argc, char ** argv) {
             }
             if (line == "free drafter" || line == "drafter free") {
                 if (drafter_loaded) {
-                    dflash27b::free_drafter(drafter_ctx);
+                    dflash::common::free_drafter(drafter_ctx);
                     drafter_loaded = false;
                     std::printf("[drafter] freed\n"); std::fflush(stdout);
                 }
@@ -1618,8 +1618,8 @@ int main(int argc, char ** argv) {
                                  "[compress] bad args, need: <bin> <keep_x1000> <drafter_gguf> [drafter_arch]\n");
                     stream_emit(-1); continue;
                 }
-                dflash27b::DrafterArch drafter_arch;
-                if (!dflash27b::parse_drafter_arch(arch_name, drafter_arch)) {
+                dflash::common::DrafterArch drafter_arch;
+                if (!dflash::common::parse_drafter_arch(arch_name, drafter_arch)) {
                     std::fprintf(stderr, "[compress] bad drafter_arch: %s\n", arch_name);
                     stream_emit(-1); continue;
                 }
@@ -1651,30 +1651,30 @@ int main(int argc, char ** argv) {
                 }
 
                 if (!drafter_loaded) {
-                    if (!dflash27b::load_drafter(drafter_path, /*gpu_layers=*/999, drafter_arch, drafter_ctx)) {
+                    if (!dflash::common::load_drafter(drafter_path, /*gpu_layers=*/999, drafter_arch, drafter_ctx)) {
                         std::fprintf(stderr, "[compress] load_drafter failed: %s\n",
                                      dflash27b_last_error());
                         stream_emit(-1); continue;
                     }
                     drafter_loaded = true;
-                    if (drafter_arch == dflash27b::DrafterArch::Qwen3_0p6b) {
+                    if (drafter_arch == dflash::common::DrafterArch::Qwen3_0p6b) {
                         std::printf("[drafter] loaded %s arch=%s (n_layer=%d n_head=%d n_head_kv=%d)\n",
-                                    drafter_path, dflash27b::drafter_arch_name(drafter_arch), drafter_ctx.weights.n_layer,
+                                    drafter_path, dflash::common::drafter_arch_name(drafter_arch), drafter_ctx.weights.n_layer,
                                     drafter_ctx.weights.n_head, drafter_ctx.weights.n_head_kv);
                     } else {
                         std::printf("[drafter] loaded %s arch=%s\n",
-                                    drafter_path, dflash27b::drafter_arch_name(drafter_arch));
+                                    drafter_path, dflash::common::drafter_arch_name(drafter_arch));
                     }
                     std::fflush(stdout);
                 } else if (drafter_ctx.arch != drafter_arch) {
                     std::fprintf(stderr, "[compress] requested arch=%s but loaded arch=%s\n",
-                                 dflash27b::drafter_arch_name(drafter_arch),
-                                 dflash27b::drafter_arch_name(drafter_ctx.arch));
+                                 dflash::common::drafter_arch_name(drafter_arch),
+                                 dflash::common::drafter_arch_name(drafter_ctx.arch));
                     stream_emit(-1); continue;
                 }
 
                 float keep = (float)keep_x1000 / 1000.0f;
-                auto compressed = dflash27b::drafter_score_and_compress(
+                auto compressed = dflash::common::drafter_score_and_compress(
                     drafter_ctx, src_ids, keep);
                 std::printf("[compress] %zu -> %zu tokens (keep_ratio=%.3f)\n",
                             src_ids.size(), compressed.size(), keep);
diff --git a/dflash/test/test_flashprefill_kernels.cpp b/dflash/test/test_flashprefill_kernels.cpp
index 357d79628..7b3927269 100644
--- a/dflash/test/test_flashprefill_kernels.cpp
+++ b/dflash/test/test_flashprefill_kernels.cpp
@@ -248,7 +248,7 @@ int main() {
         CK(cudaMemcpy(bdK, bK.data(), bK.size() * sizeof(__nv_bfloat16), cudaMemcpyHostToDevice));
         CK(cudaMemcpy(bdV, bV.data(), bV.size() * sizeof(__nv_bfloat16), cudaMemcpyHostToDevice));
 
-        dflash27b::flashprefill::FlashPrefillConfig cfg;
+        dflash::common::flashprefill::FlashPrefillConfig cfg;
         cfg.block_size = BL;
         cfg.attention_sink = 2;
         cfg.window = 4;
@@ -256,7 +256,7 @@ int main() {
         cfg.alpha = 0.12f;
 
         // Warm-up
-        dflash27b::flashprefill::flash_prefill_forward_bf16(
+        dflash::common::flashprefill::flash_prefill_forward_bf16(
             bdQ, bdK, bdV, bdO, BB, BS, BH, BHk, BD,
             1.0f / std::sqrt((float)BD), cfg);
         CK(cudaDeviceSynchronize());
@@ -266,7 +266,7 @@ int main() {
         cudaEventCreate(&e_b);
         cudaEventRecord(e_a);
         for (int it = 0; it < 5; ++it) {
-            dflash27b::flashprefill::flash_prefill_forward_bf16(
+            dflash::common::flashprefill::flash_prefill_forward_bf16(
                 bdQ, bdK, bdV, bdO, BB, BS, BH, BHk, BD,
                 1.0f / std::sqrt((float)BD), cfg);
         }
diff --git a/dflash/test/test_generate.cpp b/dflash/test/test_generate.cpp
index 0de937a42..0854a7e00 100644
--- a/dflash/test/test_generate.cpp
+++ b/dflash/test/test_generate.cpp
@@ -46,7 +46,7 @@
 #include <unistd.h>
 #endif
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 struct StepGraph {
     ggml_context *    ctx = nullptr;
diff --git a/dflash/test/test_laguna_daemon.cpp b/dflash/test/test_laguna_daemon.cpp
index dd1540581..ab6470447 100644
--- a/dflash/test/test_laguna_daemon.cpp
+++ b/dflash/test/test_laguna_daemon.cpp
@@ -1,4 +1,4 @@
-// Thin wrapper around dflash27b::run_laguna_daemon().
+// Thin wrapper around dflash::common::run_laguna_daemon().
 //
 // Kept as a separate binary so scripts/laguna_pflash_niah.py can spawn the
 // laguna daemon directly without going through test_dflash. The actual
@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    dflash27b::LagunaDaemonArgs args;
+    dflash::common::LagunaDaemonArgs args;
     args.target_path = argv[1];
 
     auto need_arg = [&](int i) {
@@ -60,5 +60,5 @@ int main(int argc, char ** argv) {
         }
     }
 
-    return dflash27b::run_laguna_daemon(args);
+    return dflash::common::run_laguna_daemon(args);
 }
diff --git a/dflash/test/test_restore_delta.cpp b/dflash/test/test_restore_delta.cpp
index 3577ada51..2b69f610a 100644
--- a/dflash/test/test_restore_delta.cpp
+++ b/dflash/test/test_restore_delta.cpp
@@ -14,7 +14,7 @@ static void check(bool ok, const char * msg) {
 }
 
 int main() {
-    using dflash27b::restore_prompt_delta;
+    using dflash::common::restore_prompt_delta;
 
     // Regression for #216: RESTORE receives the full prompt, but the backend
     // must prefill only the suffix that was not covered by the cached snapshot.
diff --git a/dflash/test/test_server_unit.cpp b/dflash/test/test_server_unit.cpp
index 4c229d92f..9238873d3 100644
--- a/dflash/test/test_server_unit.cpp
+++ b/dflash/test/test_server_unit.cpp
@@ -28,7 +28,7 @@
 #include <unistd.h>
 
 using json = nlohmann::json;
-using namespace dflash27b;
+using namespace dflash::common;
 
 // ─── Test framework (ds4 style) ────────────────────────────────────────
 
diff --git a/dflash/test/test_tokenizer_harness.cpp b/dflash/test/test_tokenizer_harness.cpp
index 6aea14bec..666dbe8be 100644
--- a/dflash/test/test_tokenizer_harness.cpp
+++ b/dflash/test/test_tokenizer_harness.cpp
@@ -20,7 +20,7 @@
 #include <string>
 
 using json = nlohmann::json;
-using namespace dflash27b;
+using namespace dflash::common;
 
 int main(int argc, char ** argv) {
     if (argc < 2) {
diff --git a/dflash/test/test_vs_oracle.cpp b/dflash/test/test_vs_oracle.cpp
index 22f547b15..7d1ef3437 100644
--- a/dflash/test/test_vs_oracle.cpp
+++ b/dflash/test/test_vs_oracle.cpp
@@ -26,7 +26,7 @@
 #include <string>
 #include <vector>
 
-using namespace dflash27b;
+using namespace dflash::common;
 
 struct OracleMeta {
     int ctx_len = 0;