diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt index d7c0bb681..2bdbb2199 100644 --- a/dflash/CMakeLists.txt +++ b/dflash/CMakeLists.txt @@ -4,9 +4,9 @@ set_property(CACHE DFLASH27B_GPU_BACKEND PROPERTY STRINGS cuda hip) string(TOLOWER "${DFLASH27B_GPU_BACKEND}" DFLASH27B_GPU_BACKEND) if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") set(DFLASH27B_USER_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}") - project(dflash27b LANGUAGES C CXX CUDA) + project(dflash LANGUAGES C CXX CUDA) elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip") - project(dflash27b LANGUAGES C CXX HIP) + project(dflash LANGUAGES C CXX HIP) else() message(FATAL_ERROR "DFLASH27B_GPU_BACKEND must be 'cuda' or 'hip', got '${DFLASH27B_GPU_BACKEND}'") endif() @@ -31,21 +31,21 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") # ROCm root for HIP builds (rpath + rocwmma header discovery). if(DFLASH27B_GPU_BACKEND STREQUAL "hip") if(DEFINED ROCM_PATH) - set(_dflash27b_rocm_root "${ROCM_PATH}") + set(_dflash_rocm_root "${ROCM_PATH}") elseif(DEFINED ENV{ROCM_PATH}) - set(_dflash27b_rocm_root "$ENV{ROCM_PATH}") + set(_dflash_rocm_root "$ENV{ROCM_PATH}") elseif(EXISTS "/opt/rocm") - set(_dflash27b_rocm_root "/opt/rocm") + set(_dflash_rocm_root "/opt/rocm") else() - set(_dflash27b_rocm_root "") + set(_dflash_rocm_root "") endif() endif() # Bake portable rpath into all executables so bundled ggml backend libs / libggml-base # are found regardless of LD_LIBRARY_PATH or stale /usr/local/lib (closes #31). 
set(CMAKE_INSTALL_RPATH "$ORIGIN/deps/llama.cpp/ggml/src;$ORIGIN/deps/llama.cpp/ggml/src/ggml-cuda;$ORIGIN/deps/llama.cpp/ggml/src/ggml-hip;$ORIGIN/../deps/llama.cpp/ggml/src;$ORIGIN/../deps/llama.cpp/ggml/src/ggml-cuda;$ORIGIN/../deps/llama.cpp/ggml/src/ggml-hip") -if(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND _dflash27b_rocm_root) - list(APPEND CMAKE_BUILD_RPATH "${_dflash27b_rocm_root}/lib" "${_dflash27b_rocm_root}/lib64") - set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${_dflash27b_rocm_root}/lib;${_dflash27b_rocm_root}/lib64") +if(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND _dflash_rocm_root) + list(APPEND CMAKE_BUILD_RPATH "${_dflash_rocm_root}/lib" "${_dflash_rocm_root}/lib64") + set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH};${_dflash_rocm_root}/lib;${_dflash_rocm_root}/lib64") endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) @@ -114,33 +114,33 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") # (110 on CUDA 13+) added when nvcc supports them. DGX Spark / # GB10 is compute capability 12.1 (121), added at CUDA 12.9+. if(DFLASH27B_USER_CUDA_ARCHITECTURES) - set(_dflash27b_archs "${DFLASH27B_USER_CUDA_ARCHITECTURES}") + set(_dflash_archs "${DFLASH27B_USER_CUDA_ARCHITECTURES}") else() - set(_dflash27b_archs "60;61;62;70;75;86") + set(_dflash_archs "60;61;62;70;75;86") if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.8") - list(APPEND _dflash27b_archs "120") + list(APPEND _dflash_archs "120") endif() if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0") - list(APPEND _dflash27b_archs "110") + list(APPEND _dflash_archs "110") endif() if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.9") - list(APPEND _dflash27b_archs "121") + list(APPEND _dflash_archs "121") endif() endif() elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip") # User override precedence: -DDFLASH27B_HIP_ARCHITECTURES → -DAMDGPU_TARGETS # → gfx1151 default (Strix Halo). 
if(DFLASH27B_HIP_ARCHITECTURES) - set(_dflash27b_archs "${DFLASH27B_HIP_ARCHITECTURES}") + set(_dflash_archs "${DFLASH27B_HIP_ARCHITECTURES}") elseif(AMDGPU_TARGETS) - set(_dflash27b_archs "${AMDGPU_TARGETS}") + set(_dflash_archs "${AMDGPU_TARGETS}") else() - set(_dflash27b_archs "gfx1151") + set(_dflash_archs "gfx1151") endif() # Make sure the HIP language picks up the resolved arch list. - set(CMAKE_HIP_ARCHITECTURES "${_dflash27b_archs}" CACHE STRING "" FORCE) + set(CMAKE_HIP_ARCHITECTURES "${_dflash_archs}" CACHE STRING "" FORCE) else() - set(_dflash27b_archs "") + set(_dflash_archs "") endif() # Consumer Blackwell workaround: skip sm_12x→sm_12xa replacement and FP4 @@ -156,9 +156,9 @@ endif() if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND NOT DEFINED _dflash_is_consumer_blackwell) set(_dflash_is_consumer_blackwell OFF) - # Iterate the resolved dflash27b arch list, not raw CMAKE_CUDA_ARCHITECTURES, + # Iterate the resolved dflash_common arch list, not raw CMAKE_CUDA_ARCHITECTURES, # which is empty on the default path (the project supplies its own list above). - foreach(_arch IN LISTS _dflash27b_archs) + foreach(_arch IN LISTS _dflash_archs) string(REGEX REPLACE "[^0-9]" "" _dflash_arch_num "${_arch}") if(_dflash_arch_num MATCHES "^12[0-9]$") set(_dflash_is_consumer_blackwell ON) @@ -196,7 +196,7 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip") find_package(hip REQUIRED) endif() -# ─── dflash27b static library ────────────────────────────────────── +# ─── dflash_common static library ────────────────────────────────────── set(DFLASH27B_SRC_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src @@ -209,7 +209,7 @@ set(DFLASH27B_SRC_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src/server ) -add_library(dflash27b STATIC +add_library(dflash_common STATIC src/errors.cpp src/qwen35/gguf_target_loader.cpp src/qwen35/qwen35_target_graph.cpp @@ -283,21 +283,21 @@ endif() # Apply the arch list to local sources. 
HIP builds pass CMAKE_HIP_ARCHITECTURES # through to the HIP toolchain/ggml backend and optionally build rocWMMA kernels. if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") - target_sources(dflash27b PRIVATE src/cuda_cross_device_copy.cpp) - set_target_properties(dflash27b PROPERTIES CUDA_ARCHITECTURES "${_dflash27b_archs}") - list(GET _dflash27b_archs 0 _dflash27b_cuda_min_sm) + target_sources(dflash_common PRIVATE src/cuda_cross_device_copy.cpp) + set_target_properties(dflash_common PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}") + list(GET _dflash_archs 0 _dflash_cuda_min_sm) # Strip any trailing 'a' suffix (e.g. "121a" -> "121") - string(REGEX REPLACE "[^0-9]" "" _dflash27b_cuda_min_sm "${_dflash27b_cuda_min_sm}") - target_compile_definitions(dflash27b PRIVATE + string(REGEX REPLACE "[^0-9]" "" _dflash_cuda_min_sm "${_dflash_cuda_min_sm}") + target_compile_definitions(dflash_common PRIVATE DFLASH27B_BACKEND_CUDA=1 - DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm} - DFLASH27B_MIN_SM=${_dflash27b_cuda_min_sm}) + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm} + DFLASH27B_MIN_SM=${_dflash_cuda_min_sm}) elseif(DFLASH27B_GPU_BACKEND STREQUAL "hip") - set_target_properties(dflash27b PROPERTIES HIP_ARCHITECTURES "${_dflash27b_archs}") - target_compile_definitions(dflash27b PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) - # hip_compat shim is needed by ALL dflash27b sources (peer_access.cpp, + set_target_properties(dflash_common PROPERTIES HIP_ARCHITECTURES "${_dflash_archs}") + target_compile_definitions(dflash_common PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP) + # hip_compat shim is needed by ALL dflash_common sources (peer_access.cpp, # dflash_feature_ring.cpp, flashprefill.cpp), not just the SM80_EQUIV path. - target_include_directories(dflash27b PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat) + target_include_directories(dflash_common PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat) endif() # FlashPrefill custom kernels. 
@@ -310,11 +310,11 @@ endif() # BF16 buffers → bf16 WMMA kernel; F16 buffers → f16 WMMA kernel; else → ggml FA. if(DFLASH27B_GPU_BACKEND STREQUAL "hip") # rms_norm_hip.cu is needed by the HIP chunk-B graph path regardless of SM80_EQUIV. - target_sources(dflash27b PRIVATE src/rms_norm_hip.cu) + target_sources(dflash_common PRIVATE src/rms_norm_hip.cu) set_source_files_properties(src/rms_norm_hip.cu PROPERTIES LANGUAGE HIP) if(DFLASH27B_HIP_SM80_EQUIV) find_path(DFLASH27B_ROCWMMA_INCLUDE_DIR rocwmma/rocwmma.hpp - HINTS "${_dflash27b_rocm_root}/include" /opt/rocm/include + HINTS "${_dflash_rocm_root}/include" /opt/rocm/include NO_DEFAULT_PATH) if(NOT DFLASH27B_ROCWMMA_INCLUDE_DIR) message(FATAL_ERROR @@ -323,16 +323,16 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "hip") "or fetch headers from https://github.com/ROCm/rocWMMA), or rebuild " "with -DDFLASH27B_HIP_SM80_EQUIV=OFF (uses the slower q8 fallback).") endif() - target_sources(dflash27b PRIVATE + target_sources(dflash_common PRIVATE src/flashprefill_kernels.hip.cu src/flashprefill_select.cpp src/flashprefill.cpp) set_source_files_properties(src/flashprefill_kernels.hip.cu PROPERTIES LANGUAGE HIP) - target_include_directories(dflash27b PRIVATE + target_include_directories(dflash_common PRIVATE ${DFLASH27B_ROCWMMA_INCLUDE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/hip_compat) - target_compile_definitions(dflash27b PRIVATE + target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_FLASHPREFILL=1 DFLASH27B_CUDA_MIN_SM=80) message(STATUS "dflash: HIP Phase 2 - rocWMMA flashprefill kernels enabled") @@ -340,7 +340,7 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "hip") message(STATUS "dflash: HIP Phase 1 - ggml q8 fallback for flashprefill") endif() elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda") - target_sources(dflash27b PRIVATE + target_sources(dflash_common PRIVATE src/flashprefill_select.cpp src/flashprefill.cpp) # Multi-arch: scan all resolved arches and compile every applicable @@ -349,7 +349,7 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL 
"cuda") set(_dflash_has_sm80 OFF) set(_dflash_has_sm70 OFF) set(_dflash_has_sm60 OFF) - foreach(_arch IN LISTS _dflash27b_archs) + foreach(_arch IN LISTS _dflash_archs) string(REGEX REPLACE "[^0-9]" "" _arch_num "${_arch}") if(_arch_num GREATER_EQUAL 80) set(_dflash_has_sm80 ON) @@ -363,19 +363,19 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda") endforeach() # Also honour the single-arch path (min_sm) so non-multi builds keep # their existing behaviour when only one tier matches. - if(_dflash27b_cuda_min_sm GREATER_EQUAL 80) + if(_dflash_cuda_min_sm GREATER_EQUAL 80) set(_dflash_has_sm80 ON) - elseif(_dflash27b_cuda_min_sm GREATER_EQUAL 70) + elseif(_dflash_cuda_min_sm GREATER_EQUAL 70) set(_dflash_has_sm70 ON) - elseif(_dflash27b_cuda_min_sm GREATER_EQUAL 60) + elseif(_dflash_cuda_min_sm GREATER_EQUAL 60) set(_dflash_has_sm60 ON) endif() if(_dflash_has_sm80) - target_sources(dflash27b PRIVATE + target_sources(dflash_common PRIVATE src/flashprefill_kernels.cu src/pflash_ggml_adapter.cpp) - target_compile_definitions(dflash27b PRIVATE + target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_CUDA_WMMA_FLASHPREFILL=1 DFLASH27B_HAVE_SM80_FLASHPREFILL=1) # BF16 WMMA kernels require sm_80+. Restrict compilation to those @@ -384,8 +384,8 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda") CUDA_ARCHITECTURES "80;86;90;100;110;120;121") endif() if(_dflash_has_sm70) - target_sources(dflash27b PRIVATE src/flashprefill_f16.cu) - target_compile_definitions(dflash27b PRIVATE + target_sources(dflash_common PRIVATE src/flashprefill_f16.cu) + target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_CUDA_WMMA_FLASHPREFILL=1 DFLASH27B_HAVE_VOLTA_FLASHPREFILL=1) # F16 WMMA kernels require sm_70+. 
Restrict compilation to Volta/Turing @@ -394,8 +394,8 @@ elseif(DFLASH27B_GPU_BACKEND STREQUAL "cuda") CUDA_ARCHITECTURES "70;75") endif() if(_dflash_has_sm60) - target_sources(dflash27b PRIVATE src/flashprefill_scalar.cu) - target_compile_definitions(dflash27b PRIVATE + target_sources(dflash_common PRIVATE src/flashprefill_scalar.cu) + target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_CUDA_SCALAR_FLASHPREFILL=1 DFLASH27B_HAVE_PASCAL_FLASHPREFILL=1) # Scalar Pascal kernels only target sm_60-69. @@ -414,7 +414,7 @@ if(DFLASH27B_GPU_BACKEND STREQUAL "hip" AND DFLASH27B_ENABLE_BSA AND NOT DFLASH2 set(DFLASH27B_ENABLE_BSA OFF) endif() if(DFLASH27B_ENABLE_BSA) - foreach(_arch IN LISTS _dflash27b_archs) + foreach(_arch IN LISTS _dflash_archs) if(_arch LESS 80) message(WARNING "DFLASH27B_ENABLE_BSA=ON requested but CUDA_ARCHITECTURES contains '${_arch}' (<80); " @@ -439,14 +439,14 @@ endif() if(DFLASH27B_ENABLE_BSA) if(DFLASH27B_GPU_BACKEND STREQUAL "hip") # HIP BSA: backed by our rocWMMA sparse-FA kernel; no CUTLASS needed. 
- target_sources(dflash27b PRIVATE src/bsa_launcher_hip.cu) + target_sources(dflash_common PRIVATE src/bsa_launcher_hip.cu) set_source_files_properties(src/bsa_launcher_hip.cu PROPERTIES LANGUAGE HIP) else() - target_sources(dflash27b PRIVATE src/bsa_fwd_inst.cu src/bsa_launcher.cu) + target_sources(dflash_common PRIVATE src/bsa_fwd_inst.cu src/bsa_launcher.cu) endif() endif() -target_include_directories(dflash27b +target_include_directories(dflash_common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include PRIVATE @@ -454,30 +454,30 @@ target_include_directories(dflash27b ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src ) if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") - target_include_directories(dflash27b PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) + target_include_directories(dflash_common PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) endif() if(DFLASH27B_ENABLE_BSA) if(DFLASH27B_GPU_BACKEND STREQUAL "hip") # HIP path: bsa_launcher_hip.cu only needs DFLASH27B_HAVE_BSA + the # hip_compat shim (which the Phase 2 block above already adds). - target_compile_definitions(dflash27b PRIVATE DFLASH27B_HAVE_BSA=1) + target_compile_definitions(dflash_common PRIVATE DFLASH27B_HAVE_BSA=1) else() - target_include_directories(dflash27b PRIVATE + target_include_directories(dflash_common PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/bsa_stubs ${CMAKE_CURRENT_SOURCE_DIR}/deps/Block-Sparse-Attention/csrc/cutlass/include ${CMAKE_CURRENT_SOURCE_DIR}/deps/Block-Sparse-Attention/csrc/block_sparse_attn/src) - target_compile_options(dflash27b PRIVATE $<$:--expt-relaxed-constexpr>) - target_compile_definitions(dflash27b PRIVATE FLASHATTENTION_DISABLE_DROPOUT FLASH_NAMESPACE=flash DFLASH27B_HAVE_BSA=1) + target_compile_options(dflash_common PRIVATE $<$:--expt-relaxed-constexpr>) + target_compile_definitions(dflash_common PRIVATE FLASHATTENTION_DISABLE_DROPOUT FLASH_NAMESPACE=flash DFLASH27B_HAVE_BSA=1) # MSVC's hides POSIX M_* macros (M_LOG2E etc.) unless _USE_MATH_DEFINES # is set before any cmath include. 
BSA's softmax.h relies on M_LOG2E; define # globally on the target so it precedes every TU's first include. if(WIN32) - target_compile_definitions(dflash27b PRIVATE _USE_MATH_DEFINES) + target_compile_definitions(dflash_common PRIVATE _USE_MATH_DEFINES) endif() endif() endif() -target_link_libraries(dflash27b +target_link_libraries(dflash_common PUBLIC ggml ${DFLASH27B_GGML_BACKEND_TARGET} @@ -485,15 +485,15 @@ target_link_libraries(dflash27b nlohmann_json::nlohmann_json ) if(DFLASH27B_GPU_BACKEND STREQUAL "hip") - target_link_libraries(dflash27b PRIVATE hip::host) + target_link_libraries(dflash_common PRIVATE hip::host) endif() if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") - target_compile_options(dflash27b PRIVATE + target_compile_options(dflash_common PRIVATE $<$:-Wall -Wextra -Wno-unused-parameter -Wno-unused-function> ) elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") - target_compile_options(dflash27b PRIVATE + target_compile_options(dflash_common PRIVATE $<$:/W4 /permissive-> ) endif() @@ -503,32 +503,32 @@ endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/pflash_daemon.cpp") add_executable(pflash_daemon test/pflash_daemon.cpp) target_include_directories(pflash_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(pflash_daemon PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(pflash_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() # ─── Tests (numerics vs oracle) ──────────────────────────────────── option(DFLASH27B_TESTS "Build numerics tests" ON) if(DFLASH27B_TESTS) - if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND _dflash27b_cuda_min_sm GREATER_EQUAL 80 AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp") + if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND _dflash_cuda_min_sm GREATER_EQUAL 80 AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flashprefill_kernels.cpp") add_executable(test_flashprefill_kernels 
test/test_flashprefill_kernels.cpp) - set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash27b_archs}") + set_target_properties(test_flashprefill_kernels PROPERTIES CUDA_ARCHITECTURES "${_dflash_archs}") target_include_directories(test_flashprefill_kernels PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_link_libraries(test_flashprefill_kernels PRIVATE dflash27b CUDA::cudart) + target_link_libraries(test_flashprefill_kernels PRIVATE dflash_common CUDA::cudart) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_kv_quant.cpp") add_executable(test_kv_quant test/test_kv_quant.cpp) target_include_directories(test_kv_quant PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(test_kv_quant PRIVATE dflash27b) + target_link_libraries(test_kv_quant PRIVATE dflash_common) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_draft_vs_reference.cpp") add_executable(test_draft_vs_reference test/test_draft_vs_reference.cpp) - target_link_libraries(test_draft_vs_reference PRIVATE dflash27b) + target_link_libraries(test_draft_vs_reference PRIVATE dflash_common) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_draft.cpp") add_executable(smoke_load_draft test/smoke_load_draft.cpp) target_include_directories(smoke_load_draft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_load_draft PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(smoke_load_draft PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/spike_thin_copy.cpp") add_executable(spike_thin_copy test/spike_thin_copy.cpp) @@ -538,62 +538,62 @@ if(DFLASH27B_TESTS) if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_draft_graph.cpp") add_executable(smoke_draft_graph test/smoke_draft_graph.cpp) target_include_directories(smoke_draft_graph PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_draft_graph PRIVATE dflash27b ggml 
${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(smoke_draft_graph PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_qwen3_forward.cpp") add_executable(smoke_qwen3_forward test/smoke_qwen3_forward.cpp) target_include_directories(smoke_qwen3_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_qwen3_forward PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(smoke_qwen3_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_vs_oracle.cpp") add_executable(test_vs_oracle test/test_vs_oracle.cpp) target_include_directories(test_vs_oracle PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(test_vs_oracle PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(test_vs_oracle PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target.cpp") add_executable(smoke_load_target test/smoke_load_target.cpp) target_include_directories(smoke_load_target PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_load_target PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(smoke_load_target PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_load_target_laguna.cpp") add_executable(smoke_load_target_laguna test/smoke_load_target_laguna.cpp) target_include_directories(smoke_load_target_laguna PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_load_target_laguna PRIVATE dflash27b ggml ggml-cuda) + target_link_libraries(smoke_load_target_laguna PRIVATE dflash_common ggml ggml-cuda) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_laguna_forward.cpp") add_executable(smoke_laguna_forward test/smoke_laguna_forward.cpp) 
target_include_directories(smoke_laguna_forward PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_laguna_forward PRIVATE dflash27b ggml ggml-cuda) + target_link_libraries(smoke_laguna_forward PRIVATE dflash_common ggml ggml-cuda) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_ttft.cpp") add_executable(bench_laguna_ttft test/bench_laguna_ttft.cpp) target_include_directories(bench_laguna_ttft PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(bench_laguna_ttft PRIVATE dflash27b ggml ggml-cuda) + target_link_libraries(bench_laguna_ttft PRIVATE dflash_common ggml ggml-cuda) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_pflash.cpp") add_executable(bench_laguna_pflash test/bench_laguna_pflash.cpp) target_include_directories(bench_laguna_pflash PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(bench_laguna_pflash PRIVATE dflash27b ggml ggml-cuda) + target_link_libraries(bench_laguna_pflash PRIVATE dflash_common ggml ggml-cuda) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_generate.cpp") add_executable(bench_laguna_generate test/bench_laguna_generate.cpp) target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(bench_laguna_generate PRIVATE dflash27b ggml ggml-cuda) + target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ggml-cuda) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp") add_executable(test_laguna_daemon test/test_laguna_daemon.cpp) target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(test_laguna_daemon PRIVATE dflash27b ggml ggml-cuda) + target_link_libraries(test_laguna_daemon PRIVATE dflash_common ggml ggml-cuda) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/smoke_target_forward.cpp") add_executable(smoke_target_forward test/smoke_target_forward.cpp) target_include_directories(smoke_target_forward 
PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(smoke_target_forward PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(smoke_target_forward PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_generate.cpp") add_executable(test_generate test/test_generate.cpp) target_include_directories(test_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) - target_link_libraries(test_generate PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(test_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_restore_delta.cpp") add_executable(test_restore_delta test/test_restore_delta.cpp) @@ -607,9 +607,9 @@ if(DFLASH27B_TESTS) else() target_compile_definitions(test_dflash PRIVATE DFLASH27B_BACKEND_CUDA=1 - DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm}) + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) endif() - target_link_libraries(test_dflash PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(test_dflash PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) # test_dflash uses cuda*/hip* runtime calls directly for fast rollback, # peer access, and target-layer split copies. Link the selected runtime. 
if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") @@ -637,9 +637,9 @@ if(DFLASH27B_TESTS) else() target_compile_definitions(dflash_server PRIVATE DFLASH27B_BACKEND_CUDA=1 - DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm}) + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) endif() - target_link_libraries(dflash_server PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread) + target_link_libraries(dflash_server PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread) if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") find_package(CUDAToolkit REQUIRED) target_link_libraries(dflash_server PRIVATE CUDA::cudart) @@ -657,9 +657,9 @@ if(DFLASH27B_TESTS) else() target_compile_definitions(test_tokenizer_harness PRIVATE DFLASH27B_BACKEND_CUDA=1 - DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm}) + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) endif() - target_link_libraries(test_tokenizer_harness PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(test_tokenizer_harness PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") find_package(CUDAToolkit REQUIRED) target_link_libraries(test_tokenizer_harness PRIVATE CUDA::cudart) @@ -679,9 +679,9 @@ if(DFLASH27B_TESTS) else() target_compile_definitions(test_server_unit PRIVATE DFLASH27B_BACKEND_CUDA=1 - DFLASH27B_CUDA_MIN_SM=${_dflash27b_cuda_min_sm}) + DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm}) endif() - target_link_libraries(test_server_unit PRIVATE dflash27b ggml ${DFLASH27B_GGML_BACKEND_TARGET}) + target_link_libraries(test_server_unit PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET}) if(DFLASH27B_GPU_BACKEND STREQUAL "cuda") find_package(CUDAToolkit REQUIRED) target_link_libraries(test_server_unit PRIVATE CUDA::cudart) @@ -723,12 +723,12 @@ if(DFLASH27B_TESTS) endforeach() # Gated on the same condition as src/pflash_ggml_adapter.cpp above: - # the adapter is only compiled into dflash27b when the CUDA arch list + # the adapter is only 
compiled into dflash_common when the CUDA arch list # includes an sm_80+ target, so this test can only link there too. if(DFLASH27B_GPU_BACKEND STREQUAL "cuda" AND _dflash_has_sm80 AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_flash_attn_sparse.cpp") add_executable(test_flash_attn_sparse test/test_flash_attn_sparse.cpp) - target_link_libraries(test_flash_attn_sparse PRIVATE dflash27b ggml ggml-cuda ggml-base) + target_link_libraries(test_flash_attn_sparse PRIVATE dflash_common ggml ggml-cuda ggml-base) target_include_directories(test_flash_attn_sparse PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/include ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/src diff --git a/dflash/README.md b/dflash/README.md index e22ee6245..35bc2a2b7 100644 --- a/dflash/README.md +++ b/dflash/README.md @@ -210,7 +210,7 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX and dispatches by arch: - `qwen35` / `qwen36` → existing DFlash + DDTree pipeline (no change). - - `laguna` → `dflash27b::run_laguna_daemon()` (no spec-decode, no DDTree). + - `laguna` → `dflash::common::run_laguna_daemon()` (no spec-decode, no DDTree). The daemon stdin/stream-fd protocol is identical, so `scripts/server.py` drives both arches end-to-end. The only thing the user changes is `--target`. diff --git a/dflash/scripts/convert_dflash_to_gguf.py b/dflash/scripts/convert_dflash_to_gguf.py index 72852f173..fae1be7e5 100644 --- a/dflash/scripts/convert_dflash_to_gguf.py +++ b/dflash/scripts/convert_dflash_to_gguf.py @@ -21,7 +21,7 @@ name. 
Usage: - PYTHONPATH=../../dflash27b_ggml/deps/llama.cpp/gguf-py python convert_dflash_to_gguf.py \ + PYTHONPATH=../../dflash_ggml/deps/llama.cpp/gguf-py python convert_dflash_to_gguf.py \ models/draft/model.safetensors \ qwen3.5-27b-dflash-draft.gguf """ diff --git a/dflash/scripts/server.py b/dflash/scripts/server.py index 78837fd4d..01bd81967 100644 --- a/dflash/scripts/server.py +++ b/dflash/scripts/server.py @@ -132,7 +132,7 @@ def _extra_daemon_has_target_sharding(extra: list[str] | None) -> bool: # Architecture strings stored in `general.architecture` of every GGUF this # server can drive. test_dflash dispatches by GGUF arch internally: # qwen35 / qwen36 -> existing DFlash + DDTree pipeline -# laguna -> dflash27b::run_laguna_daemon() (no spec-decode) +# laguna -> dflash::common::run_laguna_daemon() (no spec-decode) # server.py just needs to omit --draft + the DFlash/DDTree flags when the # arch doesn't support speculative decoding yet. _QWEN35_ARCHES = {"qwen35", "qwen36"} @@ -843,7 +843,7 @@ async def _openai_compat_error_handler(_request: Request, exc: OpenAICompatError if arch in _LAGUNA_ARCHES: # test_dflash detects arch=laguna from the GGUF and dispatches - # internally to dflash27b::run_laguna_daemon(). No --draft, no + # internally to dflash::common::run_laguna_daemon(). No --draft, no # --fast-rollback, no --ddtree (no Laguna spec-decode draft yet). # Tokens stream as int32 LE on stream_fd terminated by -1, byte- # identical to the qwen35 path so SSE/stream consumers stay shared. 
diff --git a/dflash/src/bsa_launcher.cu b/dflash/src/bsa_launcher.cu index 19698afe7..258040658 100644 --- a/dflash/src/bsa_launcher.cu +++ b/dflash/src/bsa_launcher.cu @@ -28,7 +28,7 @@ template void run_mha_fwd_block_(Flash_fwd_params ¶ms, cudaStream_t stream); } -namespace dflash27b { +namespace dflash::common { namespace flashprefill { namespace { @@ -275,4 +275,4 @@ fail: } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/bsa_launcher_hip.cu b/dflash/src/bsa_launcher_hip.cu index 0745d1c2d..870230b95 100644 --- a/dflash/src/bsa_launcher_hip.cu +++ b/dflash/src/bsa_launcher_hip.cu @@ -19,7 +19,7 @@ #include #include // size_t -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // Defined in flashprefill_kernels.hip.cu. @@ -108,4 +108,4 @@ extern "C" int launch_bsa_sparse_flash_forward_bf16( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/attn_masks.h b/dflash/src/common/attn_masks.h index b61adab30..ec25ca561 100644 --- a/dflash/src/common/attn_masks.h +++ b/dflash/src/common/attn_masks.h @@ -10,7 +10,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // Minimum alignment required by ggml flash_attn_ext for mask rows. 
static constexpr int KQ_MASK_PAD = 32; @@ -75,4 +75,4 @@ inline void build_tree_mask(const DDTree & tree, int past_length, } } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/backend_factory.cpp b/dflash/src/common/backend_factory.cpp index 30da7dec0..018045694 100644 --- a/dflash/src/common/backend_factory.cpp +++ b/dflash/src/common/backend_factory.cpp @@ -10,7 +10,7 @@ #include -namespace dflash27b { +namespace dflash::common { std::string detect_arch(const char * model_path) { auto info = inspect_gguf_model_info(model_path); @@ -107,4 +107,4 @@ std::unique_ptr create_backend(const BackendArgs & args) { } } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/backend_factory.h b/dflash/src/common/backend_factory.h index e4caed652..5a4195502 100644 --- a/dflash/src/common/backend_factory.h +++ b/dflash/src/common/backend_factory.h @@ -16,7 +16,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Backend creation arguments ───────────────────────────────────────── // A superset of all per-arch config fields. The factory reads only those @@ -62,4 +62,4 @@ std::unique_ptr create_backend(const BackendArgs & args); // Useful for early dispatch (e.g. printing which backend will be used). 
std::string detect_arch(const char * model_path); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/daemon_loop.cpp b/dflash/src/common/daemon_loop.cpp index ecc58f67d..94bd5c509 100644 --- a/dflash/src/common/daemon_loop.cpp +++ b/dflash/src/common/daemon_loop.cpp @@ -25,7 +25,7 @@ #define ssize_t long #endif -namespace dflash27b { +namespace dflash::common { // ── DaemonIO ──────────────────────────────────────────────────────────── @@ -424,4 +424,4 @@ int run_daemon(ModelBackend & backend, const DaemonLoopArgs & args) { return 0; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/daemon_loop.h b/dflash/src/common/daemon_loop.h index 7d8012ecf..de5f859b1 100644 --- a/dflash/src/common/daemon_loop.h +++ b/dflash/src/common/daemon_loop.h @@ -11,7 +11,7 @@ #include "model_backend.h" -namespace dflash27b { +namespace dflash::common { struct DaemonLoopArgs { int stream_fd = -1; @@ -23,4 +23,4 @@ struct DaemonLoopArgs { // commands until `quit`, `exit`, or EOF. Returns 0 on clean shutdown. 
int run_daemon(ModelBackend & backend, const DaemonLoopArgs & args); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/ddtree.cpp b/dflash/src/common/ddtree.cpp index e108f259d..08ca33464 100644 --- a/dflash/src/common/ddtree.cpp +++ b/dflash/src/common/ddtree.cpp @@ -7,7 +7,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { void extract_draft_topk(const float * logits, int n_positions, int vocab, int K, @@ -223,4 +223,4 @@ std::vector follow_verified_tree(const DDTree & tree, return accepted; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/ddtree.h b/dflash/src/common/ddtree.h index 75b8d8cce..afe22f226 100644 --- a/dflash/src/common/ddtree.h +++ b/dflash/src/common/ddtree.h @@ -13,7 +13,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // A flat DFS-ordered tree built from the draft's top-K softmax distributions. // Slot 0 is the tree root (the bonus token from the previous spec round); @@ -61,4 +61,4 @@ std::vector follow_verified_tree(const DDTree & tree, int & out_next_token, int * out_node_idx = nullptr); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/device_placement.h b/dflash/src/common/device_placement.h index 0b373e463..a6f87e171 100644 --- a/dflash/src/common/device_placement.h +++ b/dflash/src/common/device_placement.h @@ -9,7 +9,7 @@ #include -namespace dflash27b { +namespace dflash::common { struct DevicePlacement { int gpu = 0; // primary GPU (single-GPU mode) @@ -28,4 +28,4 @@ struct DevicePlacement { } }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_capture.cpp b/dflash/src/common/dflash_capture.cpp index 3b42f771f..a67fe9b9a 100644 --- a/dflash/src/common/dflash_capture.cpp +++ b/dflash/src/common/dflash_capture.cpp @@ -1,6 +1,6 @@ #include "dflash_capture.h" -namespace dflash27b { +namespace dflash::common { int 
target_capture_index(const int * capture_layer_ids, int n_capture_layers, @@ -12,4 +12,4 @@ int target_capture_index(const int * capture_layer_ids, return -1; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_capture.h b/dflash/src/common/dflash_capture.h index b96d34bd1..8e4415097 100644 --- a/dflash/src/common/dflash_capture.h +++ b/dflash/src/common/dflash_capture.h @@ -7,7 +7,7 @@ #pragma once -namespace dflash27b { +namespace dflash::common { // Linear search for layer_idx in capture_layer_ids[0..n_capture_layers). // Returns the capture index (0..n_capture_layers-1) on hit, -1 on miss. @@ -15,4 +15,4 @@ int target_capture_index(const int * capture_layer_ids, int n_capture_layers, int layer_idx); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_draft_graph.cpp b/dflash/src/common/dflash_draft_graph.cpp index ca46219d7..2e60acb6b 100644 --- a/dflash/src/common/dflash_draft_graph.cpp +++ b/dflash/src/common/dflash_draft_graph.cpp @@ -5,7 +5,7 @@ #include -namespace dflash27b { +namespace dflash::common { // Build draft graph at a given ctx_len into sg. Does NOT touch sg.alloc. // mirror_view: if true, uses a view into mirror->target_feat at slot0. @@ -128,4 +128,4 @@ bool build_draft_step( return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_draft_graph.h b/dflash/src/common/dflash_draft_graph.h index e6bbf1fb7..c09110270 100644 --- a/dflash/src/common/dflash_draft_graph.h +++ b/dflash/src/common/dflash_draft_graph.h @@ -15,7 +15,7 @@ #include "ggml.h" #include "ggml-backend.h" -namespace dflash27b { +namespace dflash::common { // Draft forward: speculative next-token prediction using target features. // lm_head: optional target lm_head tensor for fused projection. 
When @@ -33,4 +33,4 @@ bool build_draft_step( int committed = 0, int ctx_len_max = 0); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_draft_ipc.cpp b/dflash/src/common/dflash_draft_ipc.cpp index b23005f1f..0cab347b8 100644 --- a/dflash/src/common/dflash_draft_ipc.cpp +++ b/dflash/src/common/dflash_draft_ipc.cpp @@ -17,7 +17,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ── DFlashDraftIpcClient ──────────────────────────────────────────── @@ -248,4 +248,4 @@ bool copy_capture_slice_to_remote_draft( return remote.send_feature_slice(capture_idx, start_pos, n_tokens, host); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_draft_ipc.h b/dflash/src/common/dflash_draft_ipc.h index 26877b4f7..f5eb47ac4 100644 --- a/dflash/src/common/dflash_draft_ipc.h +++ b/dflash/src/common/dflash_draft_ipc.h @@ -31,7 +31,7 @@ # include #endif -namespace dflash27b { +namespace dflash::common { // ── IPC Client (parent process) ───────────────────────────────────── @@ -120,4 +120,4 @@ int run_dflash_draft_ipc_daemon(const char * draft_path, int draft_gpu, int stream_fd); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_draft_ipc_daemon.cpp b/dflash/src/common/dflash_draft_ipc_daemon.cpp index 6653fc436..176798f85 100644 --- a/dflash/src/common/dflash_draft_ipc_daemon.cpp +++ b/dflash/src/common/dflash_draft_ipc_daemon.cpp @@ -24,7 +24,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { int run_dflash_draft_ipc_daemon(const char * draft_path, int ring_cap, @@ -208,4 +208,4 @@ int run_dflash_draft_ipc_daemon(const char * draft_path, #endif } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_feature_ring.cpp b/dflash/src/common/dflash_feature_ring.cpp index f8ab43ea3..8cf8fa83a 100644 --- a/dflash/src/common/dflash_feature_ring.cpp +++ 
b/dflash/src/common/dflash_feature_ring.cpp @@ -16,7 +16,7 @@ extern "C++" to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); #include "gpu_runtime_compat.h" -namespace dflash27b { +namespace dflash::common { // ── internal helpers ──────────────────────────────────────────── @@ -236,4 +236,4 @@ bool copy_feature_ring_range_to_tensor( return cudaDeviceSynchronize() == cudaSuccess; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_feature_ring.h b/dflash/src/common/dflash_feature_ring.h index 133b69199..e681cecfb 100644 --- a/dflash/src/common/dflash_feature_ring.h +++ b/dflash/src/common/dflash_feature_ring.h @@ -19,7 +19,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct DraftFeatureMirror { ggml_context * ctx = nullptr; @@ -88,4 +88,4 @@ bool copy_feature_ring_range_to_tensor( int start_pos, int n_tokens); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_layer_split_runtime.h b/dflash/src/common/dflash_layer_split_runtime.h index 5e302e660..5c539989f 100644 --- a/dflash/src/common/dflash_layer_split_runtime.h +++ b/dflash/src/common/dflash_layer_split_runtime.h @@ -13,7 +13,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -namespace dflash27b { +namespace dflash::common { // ── Runtime configuration (replaces globals) ──────────────────────── @@ -74,4 +74,4 @@ inline bool activation_pair_init(ActivationPair & p, return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_spec_decode.cpp b/dflash/src/common/dflash_spec_decode.cpp index aa1831f34..9284a3440 100644 --- a/dflash/src/common/dflash_spec_decode.cpp +++ b/dflash/src/common/dflash_spec_decode.cpp @@ -12,7 +12,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace { // RAII guard so any early `return false` path frees the per-call draft graph. 
@@ -205,5 +205,5 @@ bool run_dflash_spec_decode( return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_spec_decode.h b/dflash/src/common/dflash_spec_decode.h index 1b35735f0..1337e1ff0 100644 --- a/dflash/src/common/dflash_spec_decode.h +++ b/dflash/src/common/dflash_spec_decode.h @@ -21,7 +21,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct DraftWeights; // forward-decl from internal.h @@ -50,4 +50,4 @@ bool run_dflash_spec_decode( int stream_fd = -1, DFlashDraftIpcClient * remote_draft = nullptr); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/dflash_target.h b/dflash/src/common/dflash_target.h index 3d563cf15..56fd4bece 100644 --- a/dflash/src/common/dflash_target.h +++ b/dflash/src/common/dflash_target.h @@ -14,7 +14,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct DFlashTarget { virtual ~DFlashTarget() = default; @@ -74,4 +74,4 @@ struct DFlashTarget { virtual const std::vector & capture_layer_ids() const = 0; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/gguf_inspect.cpp b/dflash/src/common/gguf_inspect.cpp index 7baf3d6b2..95cc30c41 100644 --- a/dflash/src/common/gguf_inspect.cpp +++ b/dflash/src/common/gguf_inspect.cpp @@ -5,7 +5,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { GgufModelInfo inspect_gguf_model_info(const char * path) { GgufModelInfo info; @@ -36,4 +36,4 @@ GgufModelInfo inspect_gguf_model_info(const char * path) { return info; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/gguf_inspect.h b/dflash/src/common/gguf_inspect.h index be84c9109..11c11379e 100644 --- a/dflash/src/common/gguf_inspect.h +++ b/dflash/src/common/gguf_inspect.h @@ -7,7 +7,7 @@ #include -namespace dflash27b { +namespace dflash::common { struct GgufModelInfo { std::string arch; // e.g. 
"qwen35", "laguna", "qwen3", "gemma4" @@ -18,4 +18,4 @@ struct GgufModelInfo { // Returns info with arch="" and n_layer=-1 on failure. GgufModelInfo inspect_gguf_model_info(const char * path); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/io_utils.h b/dflash/src/common/io_utils.h index 8fc14d29d..cf0c4a46f 100644 --- a/dflash/src/common/io_utils.h +++ b/dflash/src/common/io_utils.h @@ -21,7 +21,7 @@ # include #endif -namespace dflash27b { +namespace dflash::common { // ── Binary file I/O ──────────────────────────────────────────────── @@ -134,4 +134,4 @@ static inline int argmax_f32(const float * x, int n) { return best; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/layer_split_utils.cpp b/dflash/src/common/layer_split_utils.cpp index 3c49b4f66..57bf0db49 100644 --- a/dflash/src/common/layer_split_utils.cpp +++ b/dflash/src/common/layer_split_utils.cpp @@ -4,7 +4,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { std::vector> compute_layer_ranges( int n_layer, @@ -86,4 +86,4 @@ std::string validate_device_placement( return {}; // ok } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/layer_split_utils.h b/dflash/src/common/layer_split_utils.h index fcb2ba2e2..ba675947c 100644 --- a/dflash/src/common/layer_split_utils.h +++ b/dflash/src/common/layer_split_utils.h @@ -11,7 +11,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // Compute [begin, end) layer ranges for each GPU shard. // If weights is empty, splits layers equally. 
@@ -28,4 +28,4 @@ std::string validate_device_placement( const DevicePlacement & dp, int cuda_device_count); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/model_backend.h b/dflash/src/common/model_backend.h index 9a3de0cae..504b68eb0 100644 --- a/dflash/src/common/model_backend.h +++ b/dflash/src/common/model_backend.h @@ -19,7 +19,7 @@ #include "ggml-backend.h" #include "sampler.h" -namespace dflash27b { +namespace dflash::common { // Token callback for streaming generation. Called once per committed token. // Return true to continue generation, false to abort. @@ -180,4 +180,4 @@ struct ModelBackend { virtual void shutdown() = 0; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/peer_access.cpp b/dflash/src/common/peer_access.cpp index fb0dedf84..9658dce9d 100644 --- a/dflash/src/common/peer_access.cpp +++ b/dflash/src/common/peer_access.cpp @@ -3,7 +3,7 @@ #include -namespace dflash27b { +namespace dflash::common { // ── global state ──────────────────────────────────────────────── bool g_peer_access_opt_in = false; @@ -100,4 +100,4 @@ bool copy_peer_async(void * dst, int dst_device, #endif } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/peer_access.h b/dflash/src/common/peer_access.h index db2ea21fc..f6e2465c7 100644 --- a/dflash/src/common/peer_access.h +++ b/dflash/src/common/peer_access.h @@ -15,7 +15,7 @@ #include "gpu_runtime_compat.h" -namespace dflash27b { +namespace dflash::common { // ── global state ──────────────────────────────────────────────── // Set from argv: opt into cudaMemcpyPeerAsync for cross-device copies. 
@@ -34,4 +34,4 @@ bool copy_peer_async(void * dst, int dst_device, size_t bytes, cudaStream_t stream = nullptr); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/restore_delta.h b/dflash/src/common/restore_delta.h index 11ee24bb4..3b5814cfc 100644 --- a/dflash/src/common/restore_delta.h +++ b/dflash/src/common/restore_delta.h @@ -4,7 +4,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { inline std::vector restore_prompt_delta(const std::vector & prompt, int cached_prefix_len) { @@ -19,4 +19,4 @@ inline std::vector restore_prompt_delta(const std::vector & pr prompt.end()); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/sampler.cpp b/dflash/src/common/sampler.cpp index a6879f1c7..c05bdd3e2 100644 --- a/dflash/src/common/sampler.cpp +++ b/dflash/src/common/sampler.cpp @@ -8,7 +8,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { int sample_logits(const float * logits_in, int vocab, @@ -95,4 +95,4 @@ bool parse_sampler_token(std::string & line, SamplerCfg & out) { return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/sampler.h b/dflash/src/common/sampler.h index 59e1ca131..e7a86dc02 100644 --- a/dflash/src/common/sampler.h +++ b/dflash/src/common/sampler.h @@ -1,6 +1,6 @@ // Shared CPU sampler chain used by both target arches. // -// dflash27b daemon protocol embeds optional sampler params as a tail on each +// dflash daemon protocol embeds optional sampler params as a tail on each // generate command: ` samp=temp,top_p,top_k,rep_pen,seed`. parse_sampler_token // strips the tail in place and fills a SamplerCfg; sample_logits applies the // chain rep_penalty -> top_k -> softmax(temp) -> top_p -> draw.
@@ -15,7 +15,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct SamplerCfg { float temp = 0.0f; @@ -39,4 +39,4 @@ int sample_logits(const float * logits_in, // top_k=0, rep_pen=1, seed=0). bool parse_sampler_token(std::string & line, SamplerCfg & out); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/snapshot_backend.h b/dflash/src/common/snapshot_backend.h index 60636a81f..a77e70a47 100644 --- a/dflash/src/common/snapshot_backend.h +++ b/dflash/src/common/snapshot_backend.h @@ -17,7 +17,7 @@ #include -namespace dflash27b { +namespace dflash::common { // Select or create a backend for prefix cache snapshot storage. // @@ -53,4 +53,4 @@ inline void free_snapshot_backend(ggml_backend_t snap_backend, } } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/common/step_graph.h b/dflash/src/common/step_graph.h index 9ff4f4305..5e18f3f61 100644 --- a/dflash/src/common/step_graph.h +++ b/dflash/src/common/step_graph.h @@ -14,7 +14,7 @@ #include -namespace dflash27b { +namespace dflash::common { struct StepGraph { ggml_context * ctx = nullptr; @@ -66,4 +66,4 @@ inline void step_graph_destroy(StepGraph & sg) { step_graph_free(sg); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/delta_net_chunked.cpp b/dflash/src/delta_net_chunked.cpp index e91bfc862..c3421bf2b 100644 --- a/dflash/src/delta_net_chunked.cpp +++ b/dflash/src/delta_net_chunked.cpp @@ -19,7 +19,7 @@ #include -namespace dflash27b { +namespace dflash::common { static ggml_tensor * get_slice_2d(ggml_context * ctx0, ggml_tensor * t, int64_t c) { return ggml_view_4d(ctx0, t, t->ne[0], t->ne[1], 1, t->ne[3], @@ -234,4 +234,4 @@ DeltaNetChunkedResult build_delta_net_chunked( return r; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/delta_net_chunked.h b/dflash/src/delta_net_chunked.h index 80e6e00d6..173c97d54 100644 --- a/dflash/src/delta_net_chunked.h 
+++ b/dflash/src/delta_net_chunked.h @@ -4,7 +4,7 @@ #include -namespace dflash27b { +namespace dflash::common { struct DeltaNetChunkedResult { ggml_tensor * output; // [S_v, H_v, n_tokens, n_seqs] @@ -24,4 +24,4 @@ DeltaNetChunkedResult build_delta_net_chunked( ggml_tensor * b, ggml_tensor * s); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/draft/draft_dflash_graph.cpp b/dflash/src/draft/draft_dflash_graph.cpp index b83ccb0bb..eddfba9a1 100644 --- a/dflash/src/draft/draft_dflash_graph.cpp +++ b/dflash/src/draft/draft_dflash_graph.cpp @@ -33,7 +33,7 @@ #include -namespace dflash27b { +namespace dflash::common { DraftGraphOutputs build_draft_graph( ggml_context * ctx, @@ -182,4 +182,4 @@ DraftGraphOutputs build_draft_graph( return og; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/draft/draft_gguf_loader.cpp b/dflash/src/draft/draft_gguf_loader.cpp index 35b503279..89f7b17cf 100644 --- a/dflash/src/draft/draft_gguf_loader.cpp +++ b/dflash/src/draft/draft_gguf_loader.cpp @@ -39,7 +39,7 @@ #include #endif -namespace dflash27b { +namespace dflash::common { namespace { @@ -346,4 +346,4 @@ bool load_draft_gguf(const std::string & path, return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/draft/draft_graph.h b/dflash/src/draft/draft_graph.h index 304ff8e39..28bc0d837 100644 --- a/dflash/src/draft/draft_graph.h +++ b/dflash/src/draft/draft_graph.h @@ -3,7 +3,7 @@ #include "ggml.h" -namespace dflash27b { +namespace dflash::common { struct DraftWeights; // fwd @@ -30,4 +30,4 @@ DraftGraphOutputs build_draft_graph( const DraftWeights & w, const DraftGraphInputs & in); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/draft/draft_safetensors_loader.cpp b/dflash/src/draft/draft_safetensors_loader.cpp index 18825d7c6..d1ebcb989 100644 --- a/dflash/src/draft/draft_safetensors_loader.cpp +++ b/dflash/src/draft/draft_safetensors_loader.cpp 
@@ -55,7 +55,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace { @@ -646,4 +646,4 @@ void free_draft_weights(DraftWeights & w) { w.out_norm = nullptr; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/errors.cpp b/dflash/src/errors.cpp index 7ea8b0c96..970b771fe 100644 --- a/dflash/src/errors.cpp +++ b/dflash/src/errors.cpp @@ -7,7 +7,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace { std::mutex g_err_mu; @@ -19,9 +19,9 @@ void set_last_error(std::string msg) { g_last_error = std::move(msg); } -} // namespace dflash27b +} // namespace dflash::common extern "C" const char * dflash27b_last_error(void) { - std::lock_guard lk(dflash27b::g_err_mu); - return dflash27b::g_last_error.c_str(); + std::lock_guard lk(dflash::common::g_err_mu); + return dflash::common::g_last_error.c_str(); } diff --git a/dflash/src/flashprefill.cpp b/dflash/src/flashprefill.cpp index e83be1e77..0745c66bc 100644 --- a/dflash/src/flashprefill.cpp +++ b/dflash/src/flashprefill.cpp @@ -12,7 +12,7 @@ #include #include "device_runtime.h" -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // Kernel launcher declarations — architecture-specific. @@ -637,4 +637,4 @@ int flash_prefill_forward_f16( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/flashprefill.h b/dflash/src/flashprefill.h index ad7c70c0c..1cb0f66d2 100644 --- a/dflash/src/flashprefill.h +++ b/dflash/src/flashprefill.h @@ -32,7 +32,7 @@ #include #include "ggml-backend.h" -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // Algorithmic parameters for the FlashPrefill selection + sparse forward. 
@@ -98,4 +98,4 @@ extern "C" void dflash_bsa_free_persistent(); #endif } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/flashprefill_f16.cu b/dflash/src/flashprefill_f16.cu index 645904789..80cd0204f 100644 --- a/dflash/src/flashprefill_f16.cu +++ b/dflash/src/flashprefill_f16.cu @@ -19,7 +19,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // ── Kernel 1: compute_mean_vector (F16) ────────────────────────────── @@ -673,6 +673,6 @@ extern "C" void launch_block_select_f16( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 diff --git a/dflash/src/flashprefill_kernels.cu b/dflash/src/flashprefill_kernels.cu index 490f32c49..2dcfc80ce 100644 --- a/dflash/src/flashprefill_kernels.cu +++ b/dflash/src/flashprefill_kernels.cu @@ -39,7 +39,7 @@ #include #endif -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // ── cp.async helpers (sm_8x) ───────────────────────────────────────── @@ -1047,6 +1047,6 @@ extern "C" void launch_block_select( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 diff --git a/dflash/src/flashprefill_kernels.hip.cu b/dflash/src/flashprefill_kernels.hip.cu index dc81d1d6d..d7936f378 100644 --- a/dflash/src/flashprefill_kernels.hip.cu +++ b/dflash/src/flashprefill_kernels.hip.cu @@ -23,7 +23,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // ---- Kernel 1: compute_mean_vector ---- @@ -735,4 +735,4 @@ extern "C" void launch_block_select( // launch_rms_norm_mul_w_f32 is defined in rms_norm_hip.cu (compiled for all HIP builds). 
} // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/flashprefill_q8.cpp b/dflash/src/flashprefill_q8.cpp index df876d5b7..e3e2737b2 100644 --- a/dflash/src/flashprefill_q8.cpp +++ b/dflash/src/flashprefill_q8.cpp @@ -21,7 +21,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace flashprefill { namespace { @@ -166,4 +166,4 @@ int flash_prefill_forward_q8( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/flashprefill_scalar.cu b/dflash/src/flashprefill_scalar.cu index 6d7ffa5bd..1d90fcbce 100644 --- a/dflash/src/flashprefill_scalar.cu +++ b/dflash/src/flashprefill_scalar.cu @@ -19,7 +19,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // ============================================================================= @@ -512,6 +512,6 @@ extern "C" void launch_block_select_pascal( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common #endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600 && __CUDA_ARCH__ < 700) diff --git a/dflash/src/flashprefill_select.cpp b/dflash/src/flashprefill_select.cpp index a18b05d3e..4480d8e40 100644 --- a/dflash/src/flashprefill_select.cpp +++ b/dflash/src/flashprefill_select.cpp @@ -18,7 +18,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace flashprefill { // score: [B, M, N, H] row-major (B outer, H fastest). 
@@ -84,4 +84,4 @@ void block_select_host( } } // namespace flashprefill -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_backend.cpp b/dflash/src/gemma4/gemma4_backend.cpp index ea33beed5..5b6a0725c 100644 --- a/dflash/src/gemma4/gemma4_backend.cpp +++ b/dflash/src/gemma4/gemma4_backend.cpp @@ -17,7 +17,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ── Ctor / dtor ──────────────────────────────────────────────────────── @@ -370,4 +370,4 @@ void Gemma4Backend::shutdown() { std::printf("[gemma4] shutdown\n"); std::fflush(stdout); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_backend.h b/dflash/src/gemma4/gemma4_backend.h index 312466bc8..84fa08b35 100644 --- a/dflash/src/gemma4/gemma4_backend.h +++ b/dflash/src/gemma4/gemma4_backend.h @@ -17,7 +17,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct Gemma4BackendConfig { const char * model_path = nullptr; @@ -89,4 +89,4 @@ class Gemma4Backend : public ModelBackend { const DaemonIO & io); }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_daemon.cpp b/dflash/src/gemma4/gemma4_daemon.cpp index e3a495633..8c653e001 100644 --- a/dflash/src/gemma4/gemma4_daemon.cpp +++ b/dflash/src/gemma4/gemma4_daemon.cpp @@ -6,7 +6,7 @@ #include -namespace dflash27b { +namespace dflash::common { int run_gemma4_daemon(const Gemma4DaemonArgs & args) { Gemma4BackendConfig cfg; @@ -26,4 +26,4 @@ int run_gemma4_daemon(const Gemma4DaemonArgs & args) { return run_daemon(backend, da); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_daemon.h b/dflash/src/gemma4/gemma4_daemon.h index 46cd960a8..7f753b7bf 100644 --- a/dflash/src/gemma4/gemma4_daemon.h +++ b/dflash/src/gemma4/gemma4_daemon.h @@ -5,7 +5,7 @@ #include "device_placement.h" #include -namespace dflash27b { +namespace dflash::common { struct 
Gemma4DaemonArgs { const char * model_path = nullptr; @@ -17,4 +17,4 @@ struct Gemma4DaemonArgs { int run_gemma4_daemon(const Gemma4DaemonArgs & args); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_graph.cpp b/dflash/src/gemma4/gemma4_graph.cpp index a630c9fe8..c4522edd7 100644 --- a/dflash/src/gemma4/gemma4_graph.cpp +++ b/dflash/src/gemma4/gemma4_graph.cpp @@ -28,7 +28,7 @@ #include "ggml-cuda.h" #include "ggml-alloc.h" -namespace dflash27b { +namespace dflash::common { static constexpr float GEMMA4_EPS = 1e-6f; @@ -445,4 +445,4 @@ bool gemma4_step( return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_internal.h b/dflash/src/gemma4/gemma4_internal.h index f4f49c4e2..d34107b75 100644 --- a/dflash/src/gemma4/gemma4_internal.h +++ b/dflash/src/gemma4/gemma4_internal.h @@ -19,7 +19,7 @@ #include "internal.h" // CpuEmbedder -namespace dflash27b { +namespace dflash::common { struct Gemma4Layer { // Pre-attn norm @@ -181,4 +181,4 @@ bool gemma4_step( int kv_start, std::vector & out_logits); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/gemma4/gemma4_loader.cpp b/dflash/src/gemma4/gemma4_loader.cpp index de5621e60..d40db53a4 100644 --- a/dflash/src/gemma4/gemma4_loader.cpp +++ b/dflash/src/gemma4/gemma4_loader.cpp @@ -27,7 +27,7 @@ #include #endif -namespace dflash27b { +namespace dflash::common { namespace { @@ -367,4 +367,4 @@ void free_gemma4_snapshot(Gemma4Snapshot & s) { s.cur_pos = 0; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/internal.h b/dflash/src/internal.h index 05bc6823d..6f5666df6 100644 --- a/dflash/src/internal.h +++ b/dflash/src/internal.h @@ -1,4 +1,4 @@ -// Internal-only shared header for dflash27b library sources. +// Internal-only shared header for dflash library sources. // Not installed, not exposed in the public API.
#pragma once @@ -24,7 +24,7 @@ #include "dflash27b.h" -namespace dflash27b { +namespace dflash::common { // Single source of truth for error reporting. // All loaders / graph builders push into this via set_last_error(...). @@ -536,7 +536,7 @@ ggml_tensor * build_qwen35_layer( ggml_tensor * q_tail_capture = nullptr, int q_tail_start = 0); -} // namespace dflash27b +} // namespace dflash::common #if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIP) #include diff --git a/dflash/src/kv_cache.cpp b/dflash/src/kv_cache.cpp index b2006eaa0..0ffe21834 100644 --- a/dflash/src/kv_cache.cpp +++ b/dflash/src/kv_cache.cpp @@ -19,8 +19,8 @@ #include "internal.h" -namespace dflash27b { +namespace dflash::common { // Placeholder; real impl lives with the spec_loop driver. -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/kv_quant.cpp b/dflash/src/kv_quant.cpp index f2a51c648..676a103d2 100644 --- a/dflash/src/kv_quant.cpp +++ b/dflash/src/kv_quant.cpp @@ -1,4 +1,4 @@ -// KV-cache quantisation helpers for dflash27b. +// KV-cache quantisation helpers for dflash. // // Centralises the supported (K, V) ggml_type pair table and environment-variable // resolution that was previously inlined in qwen35_target_graph.cpp.
diff --git a/dflash/src/laguna/laguna_backend.cpp b/dflash/src/laguna/laguna_backend.cpp index 9781fdb49..ca64ab943 100644 --- a/dflash/src/laguna/laguna_backend.cpp +++ b/dflash/src/laguna/laguna_backend.cpp @@ -21,7 +21,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ── Construction / initialisation ─────────────────────────────────────── @@ -398,7 +398,7 @@ bool LagunaBackend::handle_compress(const std::string & line, void LagunaBackend::free_drafter() { if (drafter_loaded_) { - dflash27b::free_drafter(drafter_ctx_); + dflash::common::free_drafter(drafter_ctx_); drafter_loaded_ = false; std::printf("[drafter] freed\n"); std::fflush(stdout); } @@ -409,7 +409,7 @@ void LagunaBackend::free_drafter() { void LagunaBackend::shutdown() { for (auto & snap : snapshots_) laguna_snapshot_free(snap); if (drafter_loaded_) { - dflash27b::free_drafter(drafter_ctx_); + dflash::common::free_drafter(drafter_ctx_); drafter_loaded_ = false; } if (!target_parked_) { @@ -424,4 +424,4 @@ void LagunaBackend::shutdown() { } } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/laguna/laguna_backend.h b/dflash/src/laguna/laguna_backend.h index 387e758a6..7e487d558 100644 --- a/dflash/src/laguna/laguna_backend.h +++ b/dflash/src/laguna/laguna_backend.h @@ -18,7 +18,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct LagunaBackendArgs { std::string target_path; @@ -78,4 +78,4 @@ class LagunaBackend : public ModelBackend { bool ensure_slot(int slot); }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/laguna/laguna_daemon.cpp b/dflash/src/laguna/laguna_daemon.cpp index 06ec21458..3a64e0b8d 100644 --- a/dflash/src/laguna/laguna_daemon.cpp +++ b/dflash/src/laguna/laguna_daemon.cpp @@ -16,7 +16,7 @@ #include -namespace dflash27b { +namespace dflash::common { int run_laguna_daemon(const LagunaDaemonArgs & args) { LagunaBackendArgs bargs; @@ -36,4 +36,4 @@ int 
run_laguna_daemon(const LagunaDaemonArgs & args) { return run_daemon(backend, dargs); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/laguna/laguna_daemon.h b/dflash/src/laguna/laguna_daemon.h index 53a78566b..527582776 100644 --- a/dflash/src/laguna/laguna_daemon.h +++ b/dflash/src/laguna/laguna_daemon.h @@ -17,7 +17,7 @@ #include #include "ggml.h" -namespace dflash27b { +namespace dflash::common { struct LagunaDaemonArgs { std::string target_path; // path to laguna-*.gguf @@ -37,4 +37,4 @@ struct LagunaDaemonArgs { // `exit`, or EOF. Returns the process exit code (0 on clean shutdown). int run_laguna_daemon(const LagunaDaemonArgs & args); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/laguna/laguna_internal.h b/dflash/src/laguna/laguna_internal.h index a98d7a8b5..7c28c43ae 100644 --- a/dflash/src/laguna/laguna_internal.h +++ b/dflash/src/laguna/laguna_internal.h @@ -30,7 +30,7 @@ #include "internal.h" // for CpuEmbedder -namespace dflash27b { +namespace dflash::common { struct LagunaTargetLayer { // Pre-attn + pre-ffn norms (Laguna has only these two; no post norms). 
@@ -252,4 +252,4 @@ bool laguna_step( bool no_mask, std::vector & out_logits); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/laguna/laguna_target_graph.cpp b/dflash/src/laguna/laguna_target_graph.cpp index 7d3c21788..8f2e3c638 100644 --- a/dflash/src/laguna/laguna_target_graph.cpp +++ b/dflash/src/laguna/laguna_target_graph.cpp @@ -33,7 +33,7 @@ #include "ggml-cuda.h" #include "ggml-alloc.h" -namespace dflash27b { +namespace dflash::common { static constexpr float LAGUNA_EPS = 1e-6f; @@ -785,4 +785,4 @@ bool laguna_step( return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/laguna/laguna_target_loader.cpp b/dflash/src/laguna/laguna_target_loader.cpp index 77e0acb58..c3ca44469 100644 --- a/dflash/src/laguna/laguna_target_loader.cpp +++ b/dflash/src/laguna/laguna_target_loader.cpp @@ -55,7 +55,7 @@ #include #endif -namespace dflash27b { +namespace dflash::common { namespace { @@ -473,4 +473,4 @@ void free_laguna_target_weights(LagunaTargetWeights & w) { w.output = nullptr; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/pflash_ggml_adapter.cpp b/dflash/src/pflash_ggml_adapter.cpp index 4862d379b..93434761f 100644 --- a/dflash/src/pflash_ggml_adapter.cpp +++ b/dflash/src/pflash_ggml_adapter.cpp @@ -11,7 +11,7 @@ static int pflash_adapter( int batch, int seq_len, int n_q_heads, int n_k_heads, int head_dim, float scale, float alpha) { - dflash27b::flashprefill::FlashPrefillConfig cfg; + dflash::common::flashprefill::FlashPrefillConfig cfg; if (alpha >= 1.0f) { // alpha >= 1.0 means "select all blocks" — configure for dense attention cfg.alpha = 0.0f; @@ -21,7 +21,7 @@ static int pflash_adapter( } else { cfg.alpha = alpha; } - return dflash27b::flashprefill::flash_prefill_forward_bf16( + return dflash::common::flashprefill::flash_prefill_forward_bf16( Q, K, V, O, batch, seq_len, n_q_heads, n_k_heads, head_dim, scale, cfg); diff --git 
a/dflash/src/qwen3/qwen3_backend.cpp b/dflash/src/qwen3/qwen3_backend.cpp index d95b41413..58c476d42 100644 --- a/dflash/src/qwen3/qwen3_backend.cpp +++ b/dflash/src/qwen3/qwen3_backend.cpp @@ -20,7 +20,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ── Cache management ─────────────────────────────────────────────────── @@ -1031,7 +1031,7 @@ bool Qwen3Backend::handle_compress(const std::string & line, const DaemonIO & io void Qwen3Backend::free_drafter() { if (drafter_loaded_) { - dflash27b::free_drafter(drafter_ctx_); + dflash::common::free_drafter(drafter_ctx_); drafter_loaded_ = false; } } @@ -1060,4 +1060,4 @@ void Qwen3Backend::shutdown() { } } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_backend.h b/dflash/src/qwen3/qwen3_backend.h index bae81f52c..7d8bd0b0c 100644 --- a/dflash/src/qwen3/qwen3_backend.h +++ b/dflash/src/qwen3/qwen3_backend.h @@ -24,7 +24,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct Qwen3BackendConfig { const char * model_path = nullptr; @@ -144,4 +144,4 @@ class Qwen3Backend : public ModelBackend { std::vector last_logits_; // logits from last prefill chunk }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_daemon.cpp b/dflash/src/qwen3/qwen3_daemon.cpp index 9f478995e..f5fd59132 100644 --- a/dflash/src/qwen3/qwen3_daemon.cpp +++ b/dflash/src/qwen3/qwen3_daemon.cpp @@ -6,7 +6,7 @@ #include -namespace dflash27b { +namespace dflash::common { int run_qwen3_daemon(const Qwen3DaemonArgs & args) { Qwen3BackendConfig cfg; @@ -26,4 +26,4 @@ int run_qwen3_daemon(const Qwen3DaemonArgs & args) { return run_daemon(backend, dargs); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_daemon.h b/dflash/src/qwen3/qwen3_daemon.h index 1c457d427..2478a4c23 100644 --- a/dflash/src/qwen3/qwen3_daemon.h +++ b/dflash/src/qwen3/qwen3_daemon.h @@ -5,7 +5,7 @@ #include 
"device_placement.h" #include -namespace dflash27b { +namespace dflash::common { struct Qwen3DaemonArgs { const char * model_path = nullptr; @@ -17,4 +17,4 @@ struct Qwen3DaemonArgs { int run_qwen3_daemon(const Qwen3DaemonArgs & args); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_drafter.cpp b/dflash/src/qwen3/qwen3_drafter.cpp index 8f242eb54..c5e31fb3e 100644 --- a/dflash/src/qwen3/qwen3_drafter.cpp +++ b/dflash/src/qwen3/qwen3_drafter.cpp @@ -30,7 +30,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace { @@ -795,4 +795,4 @@ std::vector drafter_score_and_compress( return out; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_drafter.h b/dflash/src/qwen3/qwen3_drafter.h index b0687dadb..84b028a8f 100644 --- a/dflash/src/qwen3/qwen3_drafter.h +++ b/dflash/src/qwen3/qwen3_drafter.h @@ -23,7 +23,7 @@ struct ggml_backend; typedef struct ggml_backend * ggml_backend_t; -namespace dflash27b { +namespace dflash::common { enum class DrafterArch { Qwen3_0p6b, @@ -76,4 +76,4 @@ std::vector drafter_score_and_compress( int n_lookahead = 8, int pool_kernel = 13); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_drafter_model.h b/dflash/src/qwen3/qwen3_drafter_model.h index bc0664e06..8a1b2d709 100644 --- a/dflash/src/qwen3/qwen3_drafter_model.h +++ b/dflash/src/qwen3/qwen3_drafter_model.h @@ -25,7 +25,7 @@ typedef struct ggml_backend * ggml_backend_t; struct ggml_backend_buffer; typedef struct ggml_backend_buffer * ggml_backend_buffer_t; -namespace dflash27b { +namespace dflash::common { struct Qwen3DrafterLayer { ggml_tensor * attn_norm = nullptr; // [hidden] @@ -89,4 +89,4 @@ bool forward_qwen3_drafter_model( int n_lookahead, std::vector & running_max); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_graph.cpp b/dflash/src/qwen3/qwen3_graph.cpp index 
c3c1238f5..c907546f3 100644 --- a/dflash/src/qwen3/qwen3_graph.cpp +++ b/dflash/src/qwen3/qwen3_graph.cpp @@ -51,7 +51,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { namespace { @@ -865,4 +865,4 @@ bool forward_qwen3_drafter_model( return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen3/qwen3_loader.cpp b/dflash/src/qwen3/qwen3_loader.cpp index 5ab6138eb..009ff78d7 100644 --- a/dflash/src/qwen3/qwen3_loader.cpp +++ b/dflash/src/qwen3/qwen3_loader.cpp @@ -42,7 +42,7 @@ #include #endif -namespace dflash27b { +namespace dflash::common { namespace { @@ -250,4 +250,4 @@ void free_qwen3_drafter_model(Qwen3DrafterWeights & w) { w.backend = nullptr; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/gguf_target_loader.cpp b/dflash/src/qwen35/gguf_target_loader.cpp index 1499325de..62e209ebf 100644 --- a/dflash/src/qwen35/gguf_target_loader.cpp +++ b/dflash/src/qwen35/gguf_target_loader.cpp @@ -61,7 +61,7 @@ #include #endif -namespace dflash27b { +namespace dflash::common { // CpuEmbedder destructor + embed() method CpuEmbedder::~CpuEmbedder() { @@ -683,4 +683,4 @@ void free_target_weights(TargetWeights & w) { w.output = nullptr; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/graph_builders.cpp b/dflash/src/qwen35/graph_builders.cpp index 035ffb61e..c1f51cdbf 100644 --- a/dflash/src/qwen35/graph_builders.cpp +++ b/dflash/src/qwen35/graph_builders.cpp @@ -4,7 +4,7 @@ #include -namespace dflash27b { +namespace dflash::common { // ── build_layer_step ──────────────────────────────────────────── @@ -260,4 +260,4 @@ bool build_lm_head_projection_step( return ggml_gallocr_alloc_graph(sg.alloc, sg.gf); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/graph_builders.h b/dflash/src/qwen35/graph_builders.h index e820f4376..323e8e3d2 100644 --- a/dflash/src/qwen35/graph_builders.h +++ 
b/dflash/src/qwen35/graph_builders.h @@ -21,7 +21,7 @@ #include "ggml.h" #include "ggml-backend.h" -namespace dflash27b { +namespace dflash::common { // Layer-segmented prefill: process one target layer for chunk_start..chunk_start+n_tokens. bool build_layer_step( @@ -73,4 +73,4 @@ bool build_lm_head_projection_step( ggml_backend_t backend, int n_tokens); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_daemon.cpp b/dflash/src/qwen35/layer_split_daemon.cpp index 086e19dc6..c8c107bf9 100644 --- a/dflash/src/qwen35/layer_split_daemon.cpp +++ b/dflash/src/qwen35/layer_split_daemon.cpp @@ -12,7 +12,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { bool run_target_layer_split_request( std::vector & shards, @@ -87,4 +87,4 @@ bool run_target_layer_split_request( return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_daemon.h b/dflash/src/qwen35/layer_split_daemon.h index c58785248..3d87a3313 100644 --- a/dflash/src/qwen35/layer_split_daemon.h +++ b/dflash/src/qwen35/layer_split_daemon.h @@ -16,7 +16,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // Handle one inference request over a set of layer-split shards. 
// Runs prefill, then either spec-decode (if run_dflash && draft available) @@ -38,4 +38,4 @@ bool run_target_layer_split_request( int draft_ctx_max, int stream_fd = -1); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_daemon_loop.cpp b/dflash/src/qwen35/layer_split_daemon_loop.cpp index 247971e6b..6790f5510 100644 --- a/dflash/src/qwen35/layer_split_daemon_loop.cpp +++ b/dflash/src/qwen35/layer_split_daemon_loop.cpp @@ -20,7 +20,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { int run_layer_split_daemon(const LayerSplitDaemonConfig & cfg) { const auto info = inspect_gguf_model_info(cfg.target_path); @@ -192,4 +192,4 @@ int run_layer_split_daemon(const LayerSplitDaemonConfig & cfg) { return 0; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_daemon_loop.h b/dflash/src/qwen35/layer_split_daemon_loop.h index 015ccad4a..2215f00ba 100644 --- a/dflash/src/qwen35/layer_split_daemon_loop.h +++ b/dflash/src/qwen35/layer_split_daemon_loop.h @@ -19,7 +19,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct LayerSplitDaemonConfig { const char * target_path = nullptr; @@ -43,4 +43,4 @@ struct LayerSplitDaemonConfig { // Run the layer-split daemon event loop. Returns exit code (0 = success). 
int run_layer_split_daemon(const LayerSplitDaemonConfig & cfg); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_forward.cpp b/dflash/src/qwen35/layer_split_forward.cpp index 5f6b3f9f7..985be9502 100644 --- a/dflash/src/qwen35/layer_split_forward.cpp +++ b/dflash/src/qwen35/layer_split_forward.cpp @@ -15,7 +15,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { bool compute_target_split_argmax( StepGraph & sg, @@ -233,4 +233,4 @@ void free_target_layer_split_shards(std::vector & shards) shards.clear(); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_forward.h b/dflash/src/qwen35/layer_split_forward.h index cc032c1f3..7bf0b9222 100644 --- a/dflash/src/qwen35/layer_split_forward.h +++ b/dflash/src/qwen35/layer_split_forward.h @@ -17,7 +17,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // Compute argmax(logits) for a slice of the activation tensor via // out_norm + lm_head projection. @@ -52,4 +52,4 @@ bool run_target_layer_split_forward( // Free all shards (weights, cache, backend). 
void free_target_layer_split_shards(std::vector & shards); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/layer_split_types.h b/dflash/src/qwen35/layer_split_types.h index 3e4929251..3920d6f3c 100644 --- a/dflash/src/qwen35/layer_split_types.h +++ b/dflash/src/qwen35/layer_split_types.h @@ -17,7 +17,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ── Per-GPU shard for layer-split target ──────────────────────────── @@ -41,4 +41,4 @@ inline TargetLayerSplitShard * find_target_shard( return nullptr; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_backend.cpp b/dflash/src/qwen35/qwen35_backend.cpp index ecd8f745e..8b08d69e6 100644 --- a/dflash/src/qwen35/qwen35_backend.cpp +++ b/dflash/src/qwen35/qwen35_backend.cpp @@ -20,7 +20,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { #define IS_EOS_TOK(tok, w) \ ( ((w).eos_chat_id >= 0 && (tok) == (w).eos_chat_id) \ @@ -370,7 +370,7 @@ bool Qwen35Backend::handle_compress(const std::string & line, const DaemonIO & i void Qwen35Backend::free_drafter() { if (drafter_loaded_) { // Drafter has its own backend — do a full free (weights + backend) - dflash27b::free_drafter(drafter_ctx_); + dflash::common::free_drafter(drafter_ctx_); drafter_loaded_ = false; std::printf("[drafter] freed\n"); std::fflush(stdout); } @@ -975,4 +975,4 @@ int Qwen35Backend::verify_tree(int committed, const DDTree & tree) { return 0; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_backend.h b/dflash/src/qwen35/qwen35_backend.h index 18e444b2c..d87f5f0b8 100644 --- a/dflash/src/qwen35/qwen35_backend.h +++ b/dflash/src/qwen35/qwen35_backend.h @@ -28,7 +28,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ── Configuration passed at construction ──────────────────────────────── @@ -182,4 +182,4 @@ class Qwen35Backend : public ModelBackend { int 
verify_tree(int committed, const DDTree & tree); }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_daemon.cpp b/dflash/src/qwen35/qwen35_daemon.cpp index b67849e6b..d1a14a915 100644 --- a/dflash/src/qwen35/qwen35_daemon.cpp +++ b/dflash/src/qwen35/qwen35_daemon.cpp @@ -10,7 +10,7 @@ #include -namespace dflash27b { +namespace dflash::common { int run_qwen35_daemon(const Qwen35DaemonArgs & args) { Qwen35Config cfg; @@ -42,4 +42,4 @@ int run_qwen35_daemon(const Qwen35DaemonArgs & args) { return run_daemon(backend, dargs); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_daemon.h b/dflash/src/qwen35/qwen35_daemon.h index 52786b577..2135eca3f 100644 --- a/dflash/src/qwen35/qwen35_daemon.h +++ b/dflash/src/qwen35/qwen35_daemon.h @@ -8,7 +8,7 @@ #include "device_placement.h" #include -namespace dflash27b { +namespace dflash::common { struct Qwen35DaemonArgs { const char * target_path = nullptr; @@ -39,4 +39,4 @@ struct Qwen35DaemonArgs { // Run the qwen35 daemon loop. Returns 0 on clean exit, 1 on init failure. 
int run_qwen35_daemon(const Qwen35DaemonArgs & args); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_dflash_target.cpp b/dflash/src/qwen35/qwen35_dflash_target.cpp index 49d13e327..65713d1bb 100644 --- a/dflash/src/qwen35/qwen35_dflash_target.cpp +++ b/dflash/src/qwen35/qwen35_dflash_target.cpp @@ -5,7 +5,7 @@ #include "step_graph.h" #include "attn_masks.h" -namespace dflash27b { +namespace dflash::common { Qwen35DFlashTarget::~Qwen35DFlashTarget() { step_graph_destroy(proj_sg_); @@ -149,4 +149,4 @@ const std::vector & Qwen35DFlashTarget::capture_layer_ids() const { return capture_ids_; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_dflash_target.h b/dflash/src/qwen35/qwen35_dflash_target.h index ec2d609c6..6a72e48b5 100644 --- a/dflash/src/qwen35/qwen35_dflash_target.h +++ b/dflash/src/qwen35/qwen35_dflash_target.h @@ -16,7 +16,7 @@ #include -namespace dflash27b { +namespace dflash::common { class Qwen35DFlashTarget : public DFlashTarget { public: @@ -68,4 +68,4 @@ class Qwen35DFlashTarget : public DFlashTarget { StepGraph proj_sg_; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_layer_split.h b/dflash/src/qwen35/qwen35_layer_split.h index 2fb487654..1d3358043 100644 --- a/dflash/src/qwen35/qwen35_layer_split.h +++ b/dflash/src/qwen35/qwen35_layer_split.h @@ -17,7 +17,7 @@ #include -namespace dflash27b { +namespace dflash::common { struct Qwen35LayerSplitDaemonArgs { const char * target_path = nullptr; @@ -35,4 +35,4 @@ struct Qwen35LayerSplitDaemonArgs { // Will be fully implemented here once helpers are extracted. 
int run_qwen35_layer_split_daemon(const Qwen35LayerSplitDaemonArgs & args); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp b/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp index 39d87c273..76daca64f 100644 --- a/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp +++ b/dflash/src/qwen35/qwen35_layer_split_dflash_target.cpp @@ -6,7 +6,7 @@ #include "graph_builders.h" #include "step_graph.h" -namespace dflash27b { +namespace dflash::common { Qwen35LayerSplitDFlashTarget::~Qwen35LayerSplitDFlashTarget() { step_graph_destroy(proj_sg_); @@ -104,4 +104,4 @@ const std::vector & Qwen35LayerSplitDFlashTarget::capture_layer_ids() const return capture_ids_; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_layer_split_dflash_target.h b/dflash/src/qwen35/qwen35_layer_split_dflash_target.h index f6f149b16..f593009d8 100644 --- a/dflash/src/qwen35/qwen35_layer_split_dflash_target.h +++ b/dflash/src/qwen35/qwen35_layer_split_dflash_target.h @@ -21,7 +21,7 @@ #include -namespace dflash27b { +namespace dflash::common { class Qwen35LayerSplitDFlashTarget : public DFlashTarget { public: @@ -66,4 +66,4 @@ class Qwen35LayerSplitDFlashTarget : public DFlashTarget { StepGraph proj_sg_; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/qwen35/qwen35_target_graph.cpp b/dflash/src/qwen35/qwen35_target_graph.cpp index 9c7871632..fdb3a9141 100644 --- a/dflash/src/qwen35/qwen35_target_graph.cpp +++ b/dflash/src/qwen35/qwen35_target_graph.cpp @@ -39,7 +39,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Local qwen35 constants (from the GGUF, hardcoded for this model) ─ // These complement the DFLASH27B_* macros in dflash27b.h with qwen35-specific @@ -1544,4 +1544,4 @@ bool restore_target_cache_chain(const PrefixSnapshot * thick, } -} // namespace dflash27b +} // namespace dflash::common diff 
--git a/dflash/src/server/api_types.h b/dflash/src/server/api_types.h index e5803bf03..9fa187997 100644 --- a/dflash/src/server/api_types.h +++ b/dflash/src/server/api_types.h @@ -1,8 +1,8 @@ // Shared types for the server components. #pragma once -namespace dflash27b { +namespace dflash::common { enum class ApiFormat { OPENAI_CHAT, ANTHROPIC, RESPONSES, COMPLETIONS }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/chat_template.cpp b/dflash/src/server/chat_template.cpp index 758502888..92c465880 100644 --- a/dflash/src/server/chat_template.cpp +++ b/dflash/src/server/chat_template.cpp @@ -2,7 +2,7 @@ #include "chat_template.h" -namespace dflash27b { +namespace dflash::common { // Qwen3.5 tool preamble — matches the official Jinja template exactly. static const char QWEN3_TOOL_PREAMBLE[] = @@ -155,4 +155,4 @@ std::string render_chat_template( return result; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/chat_template.h b/dflash/src/server/chat_template.h index d6c34b254..5f35f4925 100644 --- a/dflash/src/server/chat_template.h +++ b/dflash/src/server/chat_template.h @@ -1,4 +1,4 @@ -// Chat template renderer for dflash27b native server. +// Chat template renderer for dflash native server. // // Renders chat messages (system/user/assistant/tool) into the model-specific // token format. Hard-coded for supported architectures: @@ -10,7 +10,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // A single message in a chat conversation. struct ChatMessage { @@ -49,4 +49,4 @@ std::string render_chat_template( // Detect the appropriate chat format for an architecture. 
ChatFormat chat_format_for_arch(const std::string & arch); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/disk_prefix_cache.cpp b/dflash/src/server/disk_prefix_cache.cpp index c60d5ca76..ca62469fc 100644 --- a/dflash/src/server/disk_prefix_cache.cpp +++ b/dflash/src/server/disk_prefix_cache.cpp @@ -16,7 +16,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Inline SHA-1 (same as prefix_cache.cpp) ──────────────────────────── @@ -729,4 +729,4 @@ bool DiskPrefixCache::read_header(FILE * f, DiskCacheHeader & hdr) { return true; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/disk_prefix_cache.h b/dflash/src/server/disk_prefix_cache.h index 508f5a07f..d4bcd7d49 100644 --- a/dflash/src/server/disk_prefix_cache.h +++ b/dflash/src/server/disk_prefix_cache.h @@ -21,7 +21,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Configuration ────────────────────────────────────────────────────── @@ -166,4 +166,4 @@ class DiskPrefixCache { static bool read_header(FILE * f, DiskCacheHeader & hdr); }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/http_server.cpp b/dflash/src/server/http_server.cpp index 57f27ed80..5d61da306 100644 --- a/dflash/src/server/http_server.cpp +++ b/dflash/src/server/http_server.cpp @@ -21,7 +21,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Utilities ────────────────────────────────────────────────────────── @@ -1086,4 +1086,4 @@ bool HttpServer::send_sse_headers(int fd) { return send_all(fd, header.data(), header.size()); } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/http_server.h b/dflash/src/server/http_server.h index c879d608d..8c0ec9eb7 100644 --- a/dflash/src/server/http_server.h +++ b/dflash/src/server/http_server.h @@ -1,4 +1,4 @@ -// HTTP server infrastructure for dflash27b native server. 
+// HTTP server infrastructure for dflash native server. // // Ported from ds4_server.c's socket/threading/HTTP layer, converted to C++. // Architecture: @@ -32,7 +32,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { using json = nlohmann::json; @@ -186,4 +186,4 @@ struct ServerJob { ServerJob * next = nullptr; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/prefix_cache.cpp b/dflash/src/server/prefix_cache.cpp index 6765b97ad..72ceae720 100644 --- a/dflash/src/server/prefix_cache.cpp +++ b/dflash/src/server/prefix_cache.cpp @@ -7,7 +7,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Chat marker resolution ──────────────────────────────────────────── @@ -424,4 +424,4 @@ void PrefixCache::abort_full_snap(int /*slot*/) { full_has_pending_evict_ = false; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/prefix_cache.h b/dflash/src/server/prefix_cache.h index a3451ee97..cb0c551bd 100644 --- a/dflash/src/server/prefix_cache.h +++ b/dflash/src/server/prefix_cache.h @@ -21,7 +21,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Chat marker detection ────────────────────────────────────────────── @@ -142,4 +142,4 @@ class PrefixCache { void move_full_to_end(int idx); }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/reasoning.cpp b/dflash/src/server/reasoning.cpp index 3b084e5e3..aec8912a8 100644 --- a/dflash/src/server/reasoning.cpp +++ b/dflash/src/server/reasoning.cpp @@ -2,7 +2,7 @@ #include "reasoning.h" -namespace dflash27b { +namespace dflash::common { static const char THINK_OPEN[] = ""; static const char THINK_CLOSE[] = ""; @@ -85,4 +85,4 @@ ReasoningResult parse_reasoning(const std::string & text, return result; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/reasoning.h b/dflash/src/server/reasoning.h 
index 95ea3046e..a7c1f8673 100644 --- a/dflash/src/server/reasoning.h +++ b/dflash/src/server/reasoning.h @@ -7,7 +7,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { struct ReasoningResult { std::string content; // cleaned content (think tags removed) @@ -22,4 +22,4 @@ ReasoningResult parse_reasoning(const std::string & text, bool thinking_enabled = true, bool started_in_thinking = false); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/server_main.cpp b/dflash/src/server/server_main.cpp index d1ba7cdc3..319f97ded 100644 --- a/dflash/src/server/server_main.cpp +++ b/dflash/src/server/server_main.cpp @@ -1,4 +1,4 @@ -// dflash_server — native C++ HTTP server for dflash27b. +// dflash_server — native C++ HTTP server for dflash. // // Replaces the Python server.py for production use. Owns the ModelBackend // directly (no subprocess, no pipe protocol), enabling: @@ -22,7 +22,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; // Global server pointer for signal handling. static HttpServer * g_server = nullptr; diff --git a/dflash/src/server/sse_emitter.cpp b/dflash/src/server/sse_emitter.cpp index 26f4158ce..59a5fe7d0 100644 --- a/dflash/src/server/sse_emitter.cpp +++ b/dflash/src/server/sse_emitter.cpp @@ -7,7 +7,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { static const char THINK_OPEN[] = ""; static const char THINK_CLOSE[] = ""; @@ -563,4 +563,4 @@ std::string SseEmitter::finish_reason() const { return tool_calls_.empty() ? 
"stop" : "tool_calls"; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/sse_emitter.h b/dflash/src/server/sse_emitter.h index 0212ec878..db5afc220 100644 --- a/dflash/src/server/sse_emitter.h +++ b/dflash/src/server/sse_emitter.h @@ -16,7 +16,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { using json = nlohmann::json; @@ -108,4 +108,4 @@ class SseEmitter { static constexpr size_t HOLDBACK = 12; // max(len(""), len(""), len("")) }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/tokenizer.cpp b/dflash/src/server/tokenizer.cpp index 5619d69f2..1f538682b 100644 --- a/dflash/src/server/tokenizer.cpp +++ b/dflash/src/server/tokenizer.cpp @@ -16,7 +16,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Unicode helpers ──────────────────────────────────────────────────── @@ -714,4 +714,4 @@ int32_t Tokenizer::token_to_id(const std::string & token) const { return it != token_to_id_.end() ? it->second : -1; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/tokenizer.h b/dflash/src/server/tokenizer.h index ff516f28c..f28dfd9fc 100644 --- a/dflash/src/server/tokenizer.h +++ b/dflash/src/server/tokenizer.h @@ -1,4 +1,4 @@ -// BPE tokenizer for dflash27b native server. +// BPE tokenizer for dflash native server. // // Loads vocabulary (token strings) and merge rules from GGUF metadata, // then provides encode (text → token IDs) and decode (token IDs → text). 
@@ -14,7 +14,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { class Tokenizer { public: @@ -82,4 +82,4 @@ class Tokenizer { PreTokenizer pre_type_ = PreTokenizer::QWEN35; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/tool_memory.cpp b/dflash/src/server/tool_memory.cpp index 75b1f63ed..1d54f11f7 100644 --- a/dflash/src/server/tool_memory.cpp +++ b/dflash/src/server/tool_memory.cpp @@ -4,7 +4,7 @@ #include -namespace dflash27b { +namespace dflash::common { ToolMemory::ToolMemory(size_t max_entries, size_t max_bytes) : max_entries_(max_entries) @@ -136,4 +136,4 @@ void ToolMemory::prune() { } } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/tool_memory.h b/dflash/src/server/tool_memory.h index c9167c270..0ff92ac0e 100644 --- a/dflash/src/server/tool_memory.h +++ b/dflash/src/server/tool_memory.h @@ -14,7 +14,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { class ToolMemory { public: @@ -55,4 +55,4 @@ class ToolMemory { std::unordered_map::iterator> lru_map_; }; -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/tool_parser.cpp b/dflash/src/server/tool_parser.cpp index 44da48a51..18fb74263 100644 --- a/dflash/src/server/tool_parser.cpp +++ b/dflash/src/server/tool_parser.cpp @@ -16,7 +16,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // ─── Helpers ──────────────────────────────────────────────────────────── @@ -465,4 +465,4 @@ ToolParseResult parse_tool_calls(const std::string & text, const json & tools) { return result; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/tool_parser.h b/dflash/src/server/tool_parser.h index 1429e6763..e891104bc 100644 --- a/dflash/src/server/tool_parser.h +++ b/dflash/src/server/tool_parser.h @@ -15,7 +15,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { using json = 
nlohmann::json; @@ -35,4 +35,4 @@ struct ToolParseResult { ToolParseResult parse_tool_calls(const std::string & text, const json & tools = json()); -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/src/server/utf8_utils.h b/dflash/src/server/utf8_utils.h index a40b04cc3..909107d51 100644 --- a/dflash/src/server/utf8_utils.h +++ b/dflash/src/server/utf8_utils.h @@ -9,7 +9,7 @@ #include #include -namespace dflash27b { +namespace dflash::common { // Snap a byte offset back to a UTF-8 code-point boundary. // Returns the largest position <= `pos` that doesn't split a multi-byte sequence. @@ -74,4 +74,4 @@ inline std::string utf8_sanitize(const std::string & s) { return out; } -} // namespace dflash27b +} // namespace dflash::common diff --git a/dflash/test/bench_laguna_generate.cpp b/dflash/test/bench_laguna_generate.cpp index 559db5695..cc068a163 100644 --- a/dflash/test/bench_laguna_generate.cpp +++ b/dflash/test/bench_laguna_generate.cpp @@ -24,7 +24,7 @@ #include "ggml-backend.h" #include "ggml-cuda.h" -using namespace dflash27b; +using namespace dflash::common; // Forward step lives in src/laguna_target_graph.cpp::laguna_step(). The // bench just times prefill + decode loops on top of it. diff --git a/dflash/test/bench_laguna_pflash.cpp b/dflash/test/bench_laguna_pflash.cpp index b4df05fbd..46b2311e0 100644 --- a/dflash/test/bench_laguna_pflash.cpp +++ b/dflash/test/bench_laguna_pflash.cpp @@ -27,7 +27,7 @@ #include "ggml-backend.h" #include "ggml-cuda.h" -using namespace dflash27b; +using namespace dflash::common; // Chunked prefill loop on top of the shared laguna_step() helper. Reports // total prefill time and the argmax / logit at the LAST chunk. 
diff --git a/dflash/test/bench_laguna_ttft.cpp b/dflash/test/bench_laguna_ttft.cpp index 8aec1613d..19be95c83 100644 --- a/dflash/test/bench_laguna_ttft.cpp +++ b/dflash/test/bench_laguna_ttft.cpp @@ -29,7 +29,7 @@ #include "ggml-backend.h" #include "ggml-cuda.h" -using namespace dflash27b; +using namespace dflash::common; static std::vector parse_csv(const std::string & s, std::vector dflt) { if (s.empty()) return dflt; diff --git a/dflash/test/pflash_daemon.cpp b/dflash/test/pflash_daemon.cpp index d19591e5c..38e291e8a 100644 --- a/dflash/test/pflash_daemon.cpp +++ b/dflash/test/pflash_daemon.cpp @@ -30,7 +30,7 @@ #include #endif -using namespace dflash27b; +using namespace dflash::common; static std::vector read_counted_i32_file(const std::string & path) { std::ifstream f(path, std::ios::binary); diff --git a/dflash/test/smoke_draft_graph.cpp b/dflash/test/smoke_draft_graph.cpp index b64cb7a36..544f8b51f 100644 --- a/dflash/test/smoke_draft_graph.cpp +++ b/dflash/test/smoke_draft_graph.cpp @@ -30,7 +30,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; // Convert fp32 -> bf16 (truncation) static uint16_t f32_to_bf16(float f) { diff --git a/dflash/test/smoke_laguna_forward.cpp b/dflash/test/smoke_laguna_forward.cpp index d92c6fc35..dbf340194 100644 --- a/dflash/test/smoke_laguna_forward.cpp +++ b/dflash/test/smoke_laguna_forward.cpp @@ -22,7 +22,7 @@ #include "ggml-cuda.h" #include "ggml-alloc.h" -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 2) { diff --git a/dflash/test/smoke_load_draft.cpp b/dflash/test/smoke_load_draft.cpp index 983e44231..9e0f37b42 100644 --- a/dflash/test/smoke_load_draft.cpp +++ b/dflash/test/smoke_load_draft.cpp @@ -17,7 +17,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 2) { diff --git a/dflash/test/smoke_load_target.cpp b/dflash/test/smoke_load_target.cpp index 
dea06dbbc..595c13530 100644 --- a/dflash/test/smoke_load_target.cpp +++ b/dflash/test/smoke_load_target.cpp @@ -16,7 +16,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 2) { diff --git a/dflash/test/smoke_load_target_laguna.cpp b/dflash/test/smoke_load_target_laguna.cpp index 6da68a4e0..c426cc1ee 100644 --- a/dflash/test/smoke_load_target_laguna.cpp +++ b/dflash/test/smoke_load_target_laguna.cpp @@ -15,7 +15,7 @@ #include "ggml-backend.h" #include "ggml-cuda.h" -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 2) { diff --git a/dflash/test/smoke_qwen3_forward.cpp b/dflash/test/smoke_qwen3_forward.cpp index 7d46748e9..4efa41781 100644 --- a/dflash/test/smoke_qwen3_forward.cpp +++ b/dflash/test/smoke_qwen3_forward.cpp @@ -24,7 +24,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 3) { diff --git a/dflash/test/smoke_target_forward.cpp b/dflash/test/smoke_target_forward.cpp index dcb61390d..285526ddb 100644 --- a/dflash/test/smoke_target_forward.cpp +++ b/dflash/test/smoke_target_forward.cpp @@ -25,7 +25,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 2) { diff --git a/dflash/test/test_dflash.cpp b/dflash/test/test_dflash.cpp index 9dc38e8d4..32ed24e20 100644 --- a/dflash/test/test_dflash.cpp +++ b/dflash/test/test_dflash.cpp @@ -24,7 +24,7 @@ #include "qwen3_drafter.h" #include "gpu_runtime_compat.h" #include "laguna_daemon.h" // arch dispatch - laguna targets are served by - // dflash27b::run_laguna_daemon() instead of the + // dflash::common::run_laguna_daemon() instead of the // qwen35 + DFlash + DDTree pipeline below. 
#include "qwen35_daemon.h" // arch dispatch - single-GPU qwen35 daemon mode #include "qwen35_layer_split.h" // multi-GPU layer-split daemon args @@ -92,7 +92,7 @@ to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); #include #include -using namespace dflash27b; +using namespace dflash::common; static SamplerCfg g_sampler; static std::mt19937_64 g_sampler_rng{std::random_device{}()}; @@ -110,16 +110,16 @@ static std::mt19937_64 g_sampler_rng{std::random_device{}()}; // ─── Small utilities — extracted to src/common/io_utils.h ────────── #include "io_utils.h" -using dflash27b::read_int32_file; -using dflash27b::write_int32_file; -using dflash27b::stream_emit_fd; -using dflash27b::argmax_f32; -using dflash27b::write_binary_file; -using dflash27b::read_binary_file_exact; -using dflash27b::read_line_tail; +using dflash::common::read_int32_file; +using dflash::common::write_int32_file; +using dflash::common::stream_emit_fd; +using dflash::common::argmax_f32; +using dflash::common::write_binary_file; +using dflash::common::read_binary_file_exact; +using dflash::common::read_line_tail; #if !defined(_WIN32) -using dflash27b::read_exact_fd; -using dflash27b::write_exact_fd; +using dflash::common::read_exact_fd; +using dflash::common::write_exact_fd; #endif // CPU sampler chain (SamplerCfg / sample_logits / parse_sampler_token) lives @@ -134,12 +134,12 @@ using dflash27b::write_exact_fd; // The global `g_kq_stride_pad` below is set at init time and forwarded to // build_causal_mask / build_tree_mask (now in src/qwen35/attn_masks.h). 
#include "attn_masks.h" -using dflash27b::KQ_MASK_PAD; -using dflash27b::F16_ZERO; -using dflash27b::F16_NEG_INF; -using dflash27b::align_up; -using dflash27b::build_causal_mask; -using dflash27b::build_tree_mask; +using dflash::common::KQ_MASK_PAD; +using dflash::common::F16_ZERO; +using dflash::common::F16_NEG_INF; +using dflash::common::align_up; +using dflash::common::build_causal_mask; +using dflash::common::build_tree_mask; static int g_kq_stride_pad = KQ_MASK_PAD; // overridden to 256 when TBQ KV is active static int g_max_ctx_override = 0; // overridden by --max-ctx=N (default 4096) static int g_fa_window = 2048; // overridden by DFLASH27B_FA_WINDOW=N @@ -150,50 +150,50 @@ static int g_draft_ctx_max = 4096; // draft context cap; --draft-ctx-ma // Extracted to src/qwen35/ddtree.{h,cpp}. Provides DDTree struct, // extract_draft_topk(), build_ddtree(), follow_verified_tree(). #include "ddtree.h" -using dflash27b::DDTree; -using dflash27b::extract_draft_topk; -using dflash27b::build_ddtree; -using dflash27b::follow_verified_tree; +using dflash::common::DDTree; +using dflash::common::extract_draft_topk; +using dflash::common::build_ddtree; +using dflash::common::follow_verified_tree; // ─── StepGraph — extracted to src/qwen35/step_graph.h ── #include "step_graph.h" -using dflash27b::StepGraph; -using dflash27b::step_graph_free; -using dflash27b::step_graph_destroy; +using dflash::common::StepGraph; +using dflash::common::step_graph_free; +using dflash::common::step_graph_destroy; // ─── Peer access + DraftFeatureMirror — extracted to src/qwen35/ ── #include "peer_access.h" #include "dflash_feature_ring.h" -using dflash27b::g_peer_access_opt_in; -using dflash27b::g_peer_pair_ok_cache; -using dflash27b::enable_peer_access_one_way; -using dflash27b::enable_peer_access_pair; -using dflash27b::cross_device_peer_memcpy_ok; -using dflash27b::copy_peer_async; -using dflash27b::DraftFeatureMirror; -using dflash27b::draft_feature_mirror_free; -using 
dflash27b::draft_feature_mirror_init; -using dflash27b::draft_feature_mirror_can_view; -using dflash27b::draft_feature_mirror_sync_range; -using dflash27b::draft_feature_mirror_sync_tail; +using dflash::common::g_peer_access_opt_in; +using dflash::common::g_peer_pair_ok_cache; +using dflash::common::enable_peer_access_one_way; +using dflash::common::enable_peer_access_pair; +using dflash::common::cross_device_peer_memcpy_ok; +using dflash::common::copy_peer_async; +using dflash::common::DraftFeatureMirror; +using dflash::common::draft_feature_mirror_free; +using dflash::common::draft_feature_mirror_init; +using dflash::common::draft_feature_mirror_can_view; +using dflash::common::draft_feature_mirror_sync_range; +using dflash::common::draft_feature_mirror_sync_tail; // ─── Graph builders — extracted to src/qwen35/graph_builders.{h,cpp} ── #include "graph_builders.h" #include "dflash_draft_graph.h" -using dflash27b::build_layer_step; -using dflash27b::build_target_step; -using dflash27b::build_target_step_tree; -using dflash27b::build_draft_step; -using dflash27b::build_lm_head_projection_step; +using dflash::common::build_layer_step; +using dflash::common::build_target_step; +using dflash::common::build_target_step_tree; +using dflash::common::build_draft_step; +using dflash::common::build_lm_head_projection_step; // ─── Layer split types — extracted to src/qwen35/layer_split_types.h ── #include "layer_split_types.h" -using dflash27b::LayerSplitRuntimeConfig; -using dflash27b::TargetLayerSplitShard; -using dflash27b::ActivationPair; -using dflash27b::activation_pair_free; -using dflash27b::activation_pair_init; -using dflash27b::find_target_shard; +using dflash::common::LayerSplitRuntimeConfig; +using dflash::common::TargetLayerSplitShard; +using dflash::common::ActivationPair; +using dflash::common::activation_pair_free; +using dflash::common::activation_pair_init; +using dflash::common::find_target_shard; static bool parse_int_list(const char * text, std::vector 
& out) { out.clear(); @@ -229,39 +229,39 @@ static bool parse_float_list(const char * text, std::vector & out) { // ─── Draft IPC — extracted to src/qwen35/draft_ipc.{h,cpp} ── #include "dflash_draft_ipc.h" -using dflash27b::DFlashDraftIpcClient; -using dflash27b::copy_capture_slice_to_remote_draft; -using dflash27b::stream_status; -using dflash27b::run_dflash_draft_ipc_daemon; +using dflash::common::DFlashDraftIpcClient; +using dflash::common::copy_capture_slice_to_remote_draft; +using dflash::common::stream_status; +using dflash::common::run_dflash_draft_ipc_daemon; // ─── GGUF inspection — extracted to src/common/gguf_inspect.{h,cpp} ── #include "gguf_inspect.h" // ─── Layer ranges — extracted to src/common/layer_split_utils.{h,cpp} ── #include "layer_split_utils.h" -using dflash27b::compute_layer_ranges; +using dflash::common::compute_layer_ranges; // ─── Feature copy helpers — extracted to src/qwen35/feature_copy.{h,cpp} ── #include "dflash_capture.h" -using dflash27b::target_capture_index; -using dflash27b::copy_capture_slice_to_draft_ring; -using dflash27b::copy_feature_ring_range_to_tensor; +using dflash::common::target_capture_index; +using dflash::common::copy_capture_slice_to_draft_ring; +using dflash::common::copy_feature_ring_range_to_tensor; // ─── Layer-split forward — extracted to src/qwen35/layer_split_forward.{h,cpp} ── #include "layer_split_forward.h" -using dflash27b::compute_target_split_argmax; -using dflash27b::run_target_layer_split_forward; -using dflash27b::free_target_layer_split_shards; +using dflash::common::compute_target_split_argmax; +using dflash::common::run_target_layer_split_forward; +using dflash::common::free_target_layer_split_shards; // ─── Speculative decode — generic loop in common/, qwen35 layer-split adapter. 
#include "qwen35_layer_split_dflash_target.h" #include "common/dflash_spec_decode.h" -using dflash27b::is_eos_tok; +using dflash::common::is_eos_tok; // ─── Layer-split daemon — extracted to src/qwen35/layer_split_daemon.{h,cpp} ─ #include "layer_split_daemon.h" -using dflash27b::run_target_layer_split_request; +using dflash::common::run_target_layer_split_request; static int run_target_layer_split_daemon( const char * target_path, @@ -318,7 +318,7 @@ static int run_target_layer_split_harness( std::fprintf(stderr, "target layer split requires prompt/n_gen/out positional args\n"); return 2; } - const int n_layer = dflash27b::inspect_gguf_model_info(target_path).n_layer; + const int n_layer = dflash::common::inspect_gguf_model_info(target_path).n_layer; if (n_layer <= 0) { std::fprintf(stderr, "target-split could not read qwen35.block_count\n"); return 1; @@ -712,7 +712,7 @@ int main(int argc, char ** argv) { // shape so we can route laguna requests to run_laguna_daemon() and // accept the no-draft argv layout server.py uses for that arch. #include "gguf_inspect.h" - const auto model_info = dflash27b::inspect_gguf_model_info(target_path); + const auto model_info = dflash::common::inspect_gguf_model_info(target_path); const std::string detected_arch = model_info.arch; const bool is_laguna = (detected_arch == "laguna"); const bool is_qwen3 = (detected_arch == "qwen3"); @@ -972,13 +972,13 @@ int main(int argc, char ** argv) { "[test_dflash] arch=laguna -> dispatching to run_laguna_daemon " "(max_ctx=%d kv=%s chunk=%d stream_fd=%d). 
DFlash + DDTree disabled.\n", max_ctx_eff, ggml_type_name(kv), chunk, stream_fd); - dflash27b::LagunaDaemonArgs largs; + dflash::common::LagunaDaemonArgs largs; largs.target_path = target_path; largs.device.max_ctx = max_ctx_eff; largs.chunk = chunk; largs.kv_type = kv; largs.stream_fd = stream_fd; - return dflash27b::run_laguna_daemon(largs); + return dflash::common::run_laguna_daemon(largs); } // ---- Arch dispatch: qwen3 targets to the dedicated daemon ----- @@ -987,13 +987,13 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[test_dflash] arch=qwen3 -> dispatching to run_qwen3_daemon " "(max_ctx=%d stream_fd=%d)\n", max_ctx_eff, stream_fd); - dflash27b::Qwen3DaemonArgs q3args; + dflash::common::Qwen3DaemonArgs q3args; q3args.model_path = target_path; q3args.device.gpu = target_gpu; q3args.device.max_ctx = max_ctx_eff; q3args.stream_fd = stream_fd; q3args.chunk = 512; - return dflash27b::run_qwen3_daemon(q3args); + return dflash::common::run_qwen3_daemon(q3args); } // ---- Arch dispatch: gemma4 targets to the dedicated daemon ----- @@ -1002,13 +1002,13 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[test_dflash] arch=gemma4 -> dispatching to run_gemma4_daemon " "(max_ctx=%d stream_fd=%d)\n", max_ctx_eff, stream_fd); - dflash27b::Gemma4DaemonArgs g4args; + dflash::common::Gemma4DaemonArgs g4args; g4args.model_path = target_path; g4args.device.gpu = target_gpu; g4args.device.max_ctx = max_ctx_eff; g4args.stream_fd = stream_fd; g4args.chunk = 512; - return dflash27b::run_gemma4_daemon(g4args); + return dflash::common::run_gemma4_daemon(g4args); } // Helper: write a committed token to the stream fd immediately (int32 LE). 
@@ -1073,7 +1073,7 @@ int main(int argc, char ** argv) { return 2; } if (daemon_mode) { - dflash27b::Qwen35LayerSplitDaemonArgs lsargs; + dflash::common::Qwen35LayerSplitDaemonArgs lsargs; lsargs.target_path = target_path; lsargs.draft_path = draft_path; lsargs.device.layer_split_gpus = target_gpus; @@ -1127,7 +1127,7 @@ int main(int argc, char ** argv) { // loop remains for one-shot, test-window, and profile-scaling modes. if (daemon_mode && target_gpus.size() <= 1) { const int max_ctx_eff = g_max_ctx_override > 0 ? g_max_ctx_override : 4096; - dflash27b::Qwen35DaemonArgs qargs; + dflash::common::Qwen35DaemonArgs qargs; qargs.target_path = target_path; qargs.draft_path = draft_path; qargs.device.gpu = target_gpu; @@ -1149,7 +1149,7 @@ int main(int argc, char ** argv) { std::fprintf(stderr, "[test_dflash] arch=qwen35 daemon -> dispatching to run_qwen35_daemon " "(max_ctx=%d stream_fd=%d)\n", max_ctx_eff, stream_fd); - return dflash27b::run_qwen35_daemon(qargs); + return dflash::common::run_qwen35_daemon(qargs); } const bool split_gpus = target_gpu != draft_gpu; @@ -1509,7 +1509,7 @@ int main(int argc, char ** argv) { bool target_parked = false; bool draft_parked = false; // pflash drafter (lazy-loaded on first `compress` command) - dflash27b::DrafterContext drafter_ctx; + dflash::common::DrafterContext drafter_ctx; bool drafter_loaded = false; while (true) { @@ -1558,7 +1558,7 @@ int main(int argc, char ** argv) { } if (line == "free drafter" || line == "drafter free") { if (drafter_loaded) { - dflash27b::free_drafter(drafter_ctx); + dflash::common::free_drafter(drafter_ctx); drafter_loaded = false; std::printf("[drafter] freed\n"); std::fflush(stdout); } @@ -1618,8 +1618,8 @@ int main(int argc, char ** argv) { "[compress] bad args, need: [drafter_arch]\n"); stream_emit(-1); continue; } - dflash27b::DrafterArch drafter_arch; - if (!dflash27b::parse_drafter_arch(arch_name, drafter_arch)) { + dflash::common::DrafterArch drafter_arch; + if 
(!dflash::common::parse_drafter_arch(arch_name, drafter_arch)) { std::fprintf(stderr, "[compress] bad drafter_arch: %s\n", arch_name); stream_emit(-1); continue; } @@ -1651,30 +1651,30 @@ int main(int argc, char ** argv) { } if (!drafter_loaded) { - if (!dflash27b::load_drafter(drafter_path, /*gpu_layers=*/999, drafter_arch, drafter_ctx)) { + if (!dflash::common::load_drafter(drafter_path, /*gpu_layers=*/999, drafter_arch, drafter_ctx)) { std::fprintf(stderr, "[compress] load_drafter failed: %s\n", dflash27b_last_error()); stream_emit(-1); continue; } drafter_loaded = true; - if (drafter_arch == dflash27b::DrafterArch::Qwen3_0p6b) { + if (drafter_arch == dflash::common::DrafterArch::Qwen3_0p6b) { std::printf("[drafter] loaded %s arch=%s (n_layer=%d n_head=%d n_head_kv=%d)\n", - drafter_path, dflash27b::drafter_arch_name(drafter_arch), drafter_ctx.weights.n_layer, + drafter_path, dflash::common::drafter_arch_name(drafter_arch), drafter_ctx.weights.n_layer, drafter_ctx.weights.n_head, drafter_ctx.weights.n_head_kv); } else { std::printf("[drafter] loaded %s arch=%s\n", - drafter_path, dflash27b::drafter_arch_name(drafter_arch)); + drafter_path, dflash::common::drafter_arch_name(drafter_arch)); } std::fflush(stdout); } else if (drafter_ctx.arch != drafter_arch) { std::fprintf(stderr, "[compress] requested arch=%s but loaded arch=%s\n", - dflash27b::drafter_arch_name(drafter_arch), - dflash27b::drafter_arch_name(drafter_ctx.arch)); + dflash::common::drafter_arch_name(drafter_arch), + dflash::common::drafter_arch_name(drafter_ctx.arch)); stream_emit(-1); continue; } float keep = (float)keep_x1000 / 1000.0f; - auto compressed = dflash27b::drafter_score_and_compress( + auto compressed = dflash::common::drafter_score_and_compress( drafter_ctx, src_ids, keep); std::printf("[compress] %zu -> %zu tokens (keep_ratio=%.3f)\n", src_ids.size(), compressed.size(), keep); diff --git a/dflash/test/test_flashprefill_kernels.cpp b/dflash/test/test_flashprefill_kernels.cpp index 
357d79628..7b3927269 100644 --- a/dflash/test/test_flashprefill_kernels.cpp +++ b/dflash/test/test_flashprefill_kernels.cpp @@ -248,7 +248,7 @@ int main() { CK(cudaMemcpy(bdK, bK.data(), bK.size() * sizeof(__nv_bfloat16), cudaMemcpyHostToDevice)); CK(cudaMemcpy(bdV, bV.data(), bV.size() * sizeof(__nv_bfloat16), cudaMemcpyHostToDevice)); - dflash27b::flashprefill::FlashPrefillConfig cfg; + dflash::common::flashprefill::FlashPrefillConfig cfg; cfg.block_size = BL; cfg.attention_sink = 2; cfg.window = 4; @@ -256,7 +256,7 @@ int main() { cfg.alpha = 0.12f; // Warm-up - dflash27b::flashprefill::flash_prefill_forward_bf16( + dflash::common::flashprefill::flash_prefill_forward_bf16( bdQ, bdK, bdV, bdO, BB, BS, BH, BHk, BD, 1.0f / std::sqrt((float)BD), cfg); CK(cudaDeviceSynchronize()); @@ -266,7 +266,7 @@ int main() { cudaEventCreate(&e_b); cudaEventRecord(e_a); for (int it = 0; it < 5; ++it) { - dflash27b::flashprefill::flash_prefill_forward_bf16( + dflash::common::flashprefill::flash_prefill_forward_bf16( bdQ, bdK, bdV, bdO, BB, BS, BH, BHk, BD, 1.0f / std::sqrt((float)BD), cfg); } diff --git a/dflash/test/test_generate.cpp b/dflash/test/test_generate.cpp index 0de937a42..0854a7e00 100644 --- a/dflash/test/test_generate.cpp +++ b/dflash/test/test_generate.cpp @@ -46,7 +46,7 @@ #include #endif -using namespace dflash27b; +using namespace dflash::common; struct StepGraph { ggml_context * ctx = nullptr; diff --git a/dflash/test/test_laguna_daemon.cpp b/dflash/test/test_laguna_daemon.cpp index dd1540581..ab6470447 100644 --- a/dflash/test/test_laguna_daemon.cpp +++ b/dflash/test/test_laguna_daemon.cpp @@ -1,4 +1,4 @@ -// Thin wrapper around dflash27b::run_laguna_daemon(). +// Thin wrapper around dflash::common::run_laguna_daemon(). // // Kept as a separate binary so scripts/laguna_pflash_niah.py can spawn the // laguna daemon directly without going through test_dflash. 
The actual @@ -28,7 +28,7 @@ int main(int argc, char ** argv) { return 1; } - dflash27b::LagunaDaemonArgs args; + dflash::common::LagunaDaemonArgs args; args.target_path = argv[1]; auto need_arg = [&](int i) { @@ -60,5 +60,5 @@ int main(int argc, char ** argv) { } } - return dflash27b::run_laguna_daemon(args); + return dflash::common::run_laguna_daemon(args); } diff --git a/dflash/test/test_restore_delta.cpp b/dflash/test/test_restore_delta.cpp index 3577ada51..2b69f610a 100644 --- a/dflash/test/test_restore_delta.cpp +++ b/dflash/test/test_restore_delta.cpp @@ -14,7 +14,7 @@ static void check(bool ok, const char * msg) { } int main() { - using dflash27b::restore_prompt_delta; + using dflash::common::restore_prompt_delta; // Regression for #216: RESTORE receives the full prompt, but the backend // must prefill only the suffix that was not covered by the cached snapshot. diff --git a/dflash/test/test_server_unit.cpp b/dflash/test/test_server_unit.cpp index 4c229d92f..9238873d3 100644 --- a/dflash/test/test_server_unit.cpp +++ b/dflash/test/test_server_unit.cpp @@ -28,7 +28,7 @@ #include using json = nlohmann::json; -using namespace dflash27b; +using namespace dflash::common; // ─── Test framework (ds4 style) ──────────────────────────────────────── diff --git a/dflash/test/test_tokenizer_harness.cpp b/dflash/test/test_tokenizer_harness.cpp index 6aea14bec..666dbe8be 100644 --- a/dflash/test/test_tokenizer_harness.cpp +++ b/dflash/test/test_tokenizer_harness.cpp @@ -20,7 +20,7 @@ #include using json = nlohmann::json; -using namespace dflash27b; +using namespace dflash::common; int main(int argc, char ** argv) { if (argc < 2) { diff --git a/dflash/test/test_vs_oracle.cpp b/dflash/test/test_vs_oracle.cpp index 22f547b15..7d1ef3437 100644 --- a/dflash/test/test_vs_oracle.cpp +++ b/dflash/test/test_vs_oracle.cpp @@ -26,7 +26,7 @@ #include #include -using namespace dflash27b; +using namespace dflash::common; struct OracleMeta { int ctx_len = 0;