Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 93 additions & 93 deletions dflash/CMakeLists.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion dflash/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ Full `bench_llm.py` suite on Qwen3.6-27B UD-Q4_K_XL, 10 prompts, n_gen=256, RTX
and dispatches by arch:

- `qwen35` / `qwen36` → existing DFlash + DDTree pipeline (no change).
- `laguna` → `dflash27b::run_laguna_daemon()` (no spec-decode, no DDTree).
- `laguna` → `dflash::common::run_laguna_daemon()` (no spec-decode, no DDTree).

The daemon stdin/stream-fd protocol is identical, so `scripts/server.py`
drives both arches end-to-end. The only thing the user changes is `--target`.
Expand Down
2 changes: 1 addition & 1 deletion dflash/scripts/convert_dflash_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
name.

Usage:
PYTHONPATH=../../dflash27b_ggml/deps/llama.cpp/gguf-py python convert_dflash_to_gguf.py \
PYTHONPATH=../../dflash_ggml/deps/llama.cpp/gguf-py python convert_dflash_to_gguf.py \
models/draft/model.safetensors \
qwen3.5-27b-dflash-draft.gguf
"""
Expand Down
4 changes: 2 additions & 2 deletions dflash/scripts/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def _extra_daemon_has_target_sharding(extra: list[str] | None) -> bool:
# Architecture strings stored in `general.architecture` of every GGUF this
# server can drive. test_dflash dispatches by GGUF arch internally:
# qwen35 / qwen36 -> existing DFlash + DDTree pipeline
# laguna -> dflash27b::run_laguna_daemon() (no spec-decode)
# laguna -> dflash::common::run_laguna_daemon() (no spec-decode)
# server.py just needs to omit --draft + the DFlash/DDTree flags when the
# arch doesn't support speculative decoding yet.
_QWEN35_ARCHES = {"qwen35", "qwen36"}
Expand Down Expand Up @@ -843,7 +843,7 @@ async def _openai_compat_error_handler(_request: Request, exc: OpenAICompatError

if arch in _LAGUNA_ARCHES:
# test_dflash detects arch=laguna from the GGUF and dispatches
# internally to dflash27b::run_laguna_daemon(). No --draft, no
# internally to dflash::common::run_laguna_daemon(). No --draft, no
# --fast-rollback, no --ddtree (no Laguna spec-decode draft yet).
# Tokens stream as int32 LE on stream_fd terminated by -1, byte-
# identical to the qwen35 path so SSE/stream consumers stay shared.
Expand Down
4 changes: 2 additions & 2 deletions dflash/src/bsa_launcher.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ template<typename T, int Headdim, bool Is_causal>
void run_mha_fwd_block_(Flash_fwd_params &params, cudaStream_t stream);
}

namespace dflash27b {
namespace dflash::common {
namespace flashprefill {

namespace {
Expand Down Expand Up @@ -275,4 +275,4 @@ fail:
}

} // namespace flashprefill
} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/bsa_launcher_hip.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <cstdint>
#include <cstdlib> // size_t

namespace dflash27b {
namespace dflash::common {
namespace flashprefill {

// Defined in flashprefill_kernels.hip.cu.
Expand Down Expand Up @@ -108,4 +108,4 @@ extern "C" int launch_bsa_sparse_flash_forward_bf16(
}

} // namespace flashprefill
} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/attn_masks.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <cstdint>
#include <vector>

namespace dflash27b {
namespace dflash::common {

// Minimum alignment required by ggml flash_attn_ext for mask rows.
static constexpr int KQ_MASK_PAD = 32;
Expand Down Expand Up @@ -75,4 +75,4 @@ inline void build_tree_mask(const DDTree & tree, int past_length,
}
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/backend_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

#include <cstdio>

namespace dflash27b {
namespace dflash::common {

std::string detect_arch(const char * model_path) {
auto info = inspect_gguf_model_info(model_path);
Expand Down Expand Up @@ -107,4 +107,4 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
}
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/backend_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#include <memory>
#include <string>

namespace dflash27b {
namespace dflash::common {

// ─── Backend creation arguments ─────────────────────────────────────────
// A superset of all per-arch config fields. The factory reads only those
Expand Down Expand Up @@ -62,4 +62,4 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args);
// Useful for early dispatch (e.g. printing which backend will be used).
std::string detect_arch(const char * model_path);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/daemon_loop.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#define ssize_t long
#endif

namespace dflash27b {
namespace dflash::common {

// ── DaemonIO ────────────────────────────────────────────────────────────

Expand Down Expand Up @@ -424,4 +424,4 @@ int run_daemon(ModelBackend & backend, const DaemonLoopArgs & args) {
return 0;
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/daemon_loop.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

#include "model_backend.h"

namespace dflash27b {
namespace dflash::common {

struct DaemonLoopArgs {
int stream_fd = -1;
Expand All @@ -23,4 +23,4 @@ struct DaemonLoopArgs {
// commands until `quit`, `exit`, or EOF. Returns 0 on clean shutdown.
int run_daemon(ModelBackend & backend, const DaemonLoopArgs & args);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/ddtree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include <cmath>
#include <vector>

namespace dflash27b {
namespace dflash::common {

void extract_draft_topk(const float * logits,
int n_positions, int vocab, int K,
Expand Down Expand Up @@ -223,4 +223,4 @@ std::vector<int> follow_verified_tree(const DDTree & tree,
return accepted;
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/ddtree.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include <unordered_map>
#include <vector>

namespace dflash27b {
namespace dflash::common {

// A flat DFS-ordered tree built from the draft's top-K softmax distributions.
// Slot 0 is the tree root (the bonus token from the previous spec round);
Expand Down Expand Up @@ -61,4 +61,4 @@ std::vector<int> follow_verified_tree(const DDTree & tree,
int & out_next_token,
int * out_node_idx = nullptr);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/device_placement.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

#include <vector>

namespace dflash27b {
namespace dflash::common {

struct DevicePlacement {
int gpu = 0; // primary GPU (single-GPU mode)
Expand All @@ -28,4 +28,4 @@ struct DevicePlacement {
}
};

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_capture.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "dflash_capture.h"

namespace dflash27b {
namespace dflash::common {

int target_capture_index(const int * capture_layer_ids,
int n_capture_layers,
Expand All @@ -12,4 +12,4 @@ int target_capture_index(const int * capture_layer_ids,
return -1;
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_capture.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@

#pragma once

namespace dflash27b {
namespace dflash::common {

// Linear search for layer_idx in capture_layer_ids[0..n_capture_layers).
// Returns the capture index (0..n_capture_layers-1) on hit, -1 on miss.
int target_capture_index(const int * capture_layer_ids,
int n_capture_layers,
int layer_idx);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_draft_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

#include <cstdio>

namespace dflash27b {
namespace dflash::common {

// Build draft graph at a given ctx_len into sg. Does NOT touch sg.alloc.
// mirror_view: if true, uses a view into mirror->target_feat at slot0.
Expand Down Expand Up @@ -128,4 +128,4 @@ bool build_draft_step(
return ggml_gallocr_alloc_graph(sg.alloc, sg.gf);
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_draft_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "ggml.h"
#include "ggml-backend.h"

namespace dflash27b {
namespace dflash::common {

// Draft forward: speculative next-token prediction using target features.
// lm_head: optional target lm_head tensor for fused projection. When
Expand All @@ -33,4 +33,4 @@ bool build_draft_step(
int committed = 0,
int ctx_len_max = 0);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_draft_ipc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
#include <string>
#include <vector>

namespace dflash27b {
namespace dflash::common {

// ── DFlashDraftIpcClient ────────────────────────────────────────────

Expand Down Expand Up @@ -248,4 +248,4 @@ bool copy_capture_slice_to_remote_draft(
return remote.send_feature_slice(capture_idx, start_pos, n_tokens, host);
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_draft_ipc.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
# include <unistd.h>
#endif

namespace dflash27b {
namespace dflash::common {

// ── IPC Client (parent process) ─────────────────────────────────────

Expand Down Expand Up @@ -120,4 +120,4 @@ int run_dflash_draft_ipc_daemon(const char * draft_path,
int draft_gpu,
int stream_fd);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_draft_ipc_daemon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#include <string>
#include <vector>

namespace dflash27b {
namespace dflash::common {

int run_dflash_draft_ipc_daemon(const char * draft_path,
int ring_cap,
Expand Down Expand Up @@ -208,4 +208,4 @@ int run_dflash_draft_ipc_daemon(const char * draft_path,
#endif
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_feature_ring.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ extern "C++" to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type);

#include "gpu_runtime_compat.h"

namespace dflash27b {
namespace dflash::common {

// ── internal helpers ────────────────────────────────────────────

Expand Down Expand Up @@ -236,4 +236,4 @@ bool copy_feature_ring_range_to_tensor(
return cudaDeviceSynchronize() == cudaSuccess;
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_feature_ring.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include <cstddef>
#include <cstdint>

namespace dflash27b {
namespace dflash::common {

struct DraftFeatureMirror {
ggml_context * ctx = nullptr;
Expand Down Expand Up @@ -88,4 +88,4 @@ bool copy_feature_ring_range_to_tensor(
int start_pos,
int n_tokens);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_layer_split_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"

namespace dflash27b {
namespace dflash::common {

// ── Runtime configuration (replaces globals) ────────────────────────

Expand Down Expand Up @@ -74,4 +74,4 @@ inline bool activation_pair_init(ActivationPair & p,
return true;
}

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_spec_decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <cstdio>
#include <vector>

namespace dflash27b {
namespace dflash::common {

namespace {
// RAII guard so any early `return false` path frees the per-call draft graph.
Expand Down Expand Up @@ -205,5 +205,5 @@ bool run_dflash_spec_decode(
return true;
}

} // namespace dflash27b
} // namespace dflash::common

4 changes: 2 additions & 2 deletions dflash/src/common/dflash_spec_decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include <cstdint>
#include <vector>

namespace dflash27b {
namespace dflash::common {

struct DraftWeights; // forward-decl from internal.h

Expand Down Expand Up @@ -50,4 +50,4 @@ bool run_dflash_spec_decode(
int stream_fd = -1,
DFlashDraftIpcClient * remote_draft = nullptr);

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/dflash_target.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include <cstdint>
#include <vector>

namespace dflash27b {
namespace dflash::common {

struct DFlashTarget {
virtual ~DFlashTarget() = default;
Expand Down Expand Up @@ -74,4 +74,4 @@ struct DFlashTarget {
virtual const std::vector<int> & capture_layer_ids() const = 0;
};

} // namespace dflash27b
} // namespace dflash::common
4 changes: 2 additions & 2 deletions dflash/src/common/gguf_inspect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#include <cstring>
#include <string>

namespace dflash27b {
namespace dflash::common {

GgufModelInfo inspect_gguf_model_info(const char * path) {
GgufModelInfo info;
Expand Down Expand Up @@ -36,4 +36,4 @@ GgufModelInfo inspect_gguf_model_info(const char * path) {
return info;
}

} // namespace dflash27b
} // namespace dflash::common
Loading
Loading