Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions dflash/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -671,6 +671,27 @@ if(DFLASH27B_TESTS)
endif()
endif()

# Standalone draft IPC daemon executable. The target is only defined when its
# entry-point source is present in the tree, so checkouts without
# src/ipc/dflash_draft_ipc_main.cpp still configure.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/ipc/dflash_draft_ipc_main.cpp")
add_executable(dflash_draft_ipc_daemon
src/ipc/dflash_draft_ipc_main.cpp
)
target_include_directories(dflash_draft_ipc_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
# Backend-selection compile definitions: HIP when explicitly requested,
# otherwise the CUDA definitions.
# NOTE(review): _dflash_cuda_min_sm is not set in this section; presumably it
# is defined earlier in this file -- confirm.
if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
target_compile_definitions(dflash_draft_ipc_daemon PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
else()
target_compile_definitions(dflash_draft_ipc_daemon PRIVATE
DFLASH27B_BACKEND_CUDA=1
DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
endif()
# NOTE(review): pthread is linked by bare library name here; confirm whether
# the project prefers find_package(Threads) / Threads::Threads.
target_link_libraries(dflash_draft_ipc_daemon PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread)
# GPU runtime library.
# NOTE(review): this test is keyed on "cuda" with HIP as the fallback, while
# the compile definitions above are keyed on "hip" with CUDA as the fallback.
# A DFLASH27B_GPU_BACKEND value that is neither string would get the CUDA
# definitions but link hip::host -- confirm the value is validated elsewhere.
if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
find_package(CUDAToolkit REQUIRED)
target_link_libraries(dflash_draft_ipc_daemon PRIVATE CUDA::cudart)
else()
target_link_libraries(dflash_draft_ipc_daemon PRIVATE hip::host)
endif()
endif()

# Tokenizer test harness (no GPU needed — links static lib for tokenizer + GGUF reader)
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_tokenizer_harness.cpp")
add_executable(test_tokenizer_harness test/test_tokenizer_harness.cpp)
Expand Down
5 changes: 5 additions & 0 deletions dflash/src/common/backend_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ std::string detect_arch(const char * model_path) {
return info.arch;
}

// Reports whether the model architecture named by `arch` may use remote-draft
// execution.
//
// Only "qwen35" is accepted today. Per the review discussion on this change,
// gemma4 and other model families are intended to be enabled in a follow-up
// once their backend feature fixes have been merged.
//
// @param arch  Architecture identifier to test.
// @return true only when `arch` is exactly "qwen35".
bool arch_supports_remote_draft(const std::string & arch) {
    return arch == "qwen35";
}

std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
if (!args.model_path) {
std::fprintf(stderr, "[backend_factory] model_path is null\n");
Expand All @@ -38,6 +42,7 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args) {
cfg.draft_path = args.draft_path;
cfg.device = args.device;
cfg.draft_gpu = args.draft_device.gpu;
cfg.remote_draft = args.remote_draft;
cfg.stream_fd = args.stream_fd;
cfg.fa_window = args.fa_window;
cfg.kq_stride_pad = args.kq_stride_pad;
Expand Down
4 changes: 4 additions & 0 deletions dflash/src/common/backend_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "model_backend.h"
#include "placement/placement_config.h"
#include "placement/remote_draft_config.h"

#include <memory>
#include <string>
Expand All @@ -31,6 +32,7 @@ struct BackendArgs {
// Device placement
DevicePlacement device;
DevicePlacement draft_device;
RemoteDraftConfig remote_draft;

// I/O — only used when running under daemon_loop (legacy). The new
// server passes -1 and uses on_token callbacks instead.
Expand Down Expand Up @@ -62,4 +64,6 @@ std::unique_ptr<ModelBackend> create_backend(const BackendArgs & args);
// Useful for early dispatch (e.g. printing which backend will be used).
std::string detect_arch(const char * model_path);

// Report whether the given architecture identifier is accepted for
// remote-draft execution. The accepted set is defined by the implementation
// in backend_factory.cpp.
bool arch_supports_remote_draft(const std::string & arch);

} // namespace dflash::common
6 changes: 6 additions & 0 deletions dflash/src/common/model_backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,12 @@ struct ModelBackend {
// growth over time. Default is a no-op.
virtual void release_scratch() {}

// Capability query: can this backend run its draft model through the common
// remote-draft IPC transport? The base implementation answers false, which
// is what model families that do not implement the DFlash feature boundary
// inherit; the server rejects such backends before startup.
virtual bool supports_remote_draft() const {
    return false;
}

// ── Cleanup ──────────────────────────────────────────────────────
// Release all resources (weights, cache, snapshots, drafter).
// Called by run_daemon() before returning.
Expand Down
45 changes: 45 additions & 0 deletions dflash/src/ipc/dflash_draft_ipc_main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// Standalone DFlash draft IPC daemon entry point.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not the proper place for this binary's main; put it in the src/ipc folder.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree, let me move it


#include "dflash_draft_ipc.h"

#include <algorithm>
#include <cerrno>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>

using namespace dflash::common;

namespace {

// Outcome of testing one argv entry against a value-taking option name.
enum class CliOptionMatch { kNone, kValue, kMissingValue };

// Tests argv[index] against `name`, accepting both `--name=VALUE` and
// `--name VALUE`. On kValue, `value` points at the value text and, for the
// two-token form, `index` has been advanced past the consumed value.
// kMissingValue means the two-token form was the last argument.
CliOptionMatch match_cli_value_option(int argc, char ** argv, int & index,
                                      const char * name, const char *& value) {
    const char * arg = argv[index];
    const std::size_t name_len = std::strlen(name);
    if (std::strncmp(arg, name, name_len) != 0) {
        return CliOptionMatch::kNone;
    }
    if (arg[name_len] == '=') {
        value = arg + name_len + 1;
        return CliOptionMatch::kValue;
    }
    if (arg[name_len] != '\0') {
        return CliOptionMatch::kNone;  // a longer option sharing this prefix
    }
    if (index + 1 >= argc) {
        return CliOptionMatch::kMissingValue;
    }
    value = argv[++index];
    return CliOptionMatch::kValue;
}

// Parses `text` as a base-10 int. Returns false, leaving `out` untouched,
// when the text is empty, has trailing characters, or does not fit in an int.
// Unlike std::atoi this never silently maps garbage to 0 and has defined
// behaviour on overflow.
bool parse_cli_int(const char * text, int & out) {
    if (text == nullptr || *text == '\0') {
        return false;
    }
    errno = 0;
    char * end = nullptr;
    const long parsed = std::strtol(text, &end, 10);
    if (end == text || *end != '\0') {
        return false;
    }
    if (errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX) {
        return false;
    }
    out = static_cast<int>(parsed);
    return true;
}

}  // namespace

// Entry point of the standalone draft IPC daemon.
//
// Expected invocation:
//   <prog> --draft-ipc-daemon <draft path> [--ring-cap N] [--draft-gpu N]
//          [--stream-fd FD]
// Each option accepts both `--opt=VALUE` and `--opt VALUE`.
//
// Returns 2 on any command-line error (bad mode flag, unknown option, missing
// or non-integer option value); otherwise returns whatever
// run_dflash_draft_ipc_daemon() returns.
int main(int argc, char ** argv) {
    if (argc < 3 || std::strcmp(argv[1], "--draft-ipc-daemon") != 0) {
        std::fprintf(stderr,
                     "usage: %s --draft-ipc-daemon <draft.safetensors|draft.gguf> "
                     "--ring-cap=N --stream-fd=FD [--draft-gpu=N]\n",
                     argv[0]);
        return 2;
    }

    const char * draft_path = argv[2];
    int ring_cap = 4096;
    int draft_gpu = 0;
    // Per the review discussion on this change: the stream fd carries only
    // small control/status messages. Feature/noise tensors are written to
    // temporary files and their paths are sent over the control channel;
    // mmap/shared memory is noted as a follow-up optimization.
    int stream_fd = -1;

    struct ValueOption {
        const char * name;
        int * target;
    };
    const ValueOption options[] = {
        {"--ring-cap", &ring_cap},
        {"--draft-gpu", &draft_gpu},
        {"--stream-fd", &stream_fd},
    };

    for (int i = 3; i < argc; i++) {
        const char * option_text = argv[i];
        bool handled = false;
        for (const ValueOption & opt : options) {
            const char * value = nullptr;
            const CliOptionMatch match =
                match_cli_value_option(argc, argv, i, opt.name, value);
            if (match == CliOptionMatch::kNone) {
                continue;
            }
            if (match == CliOptionMatch::kMissingValue) {
                std::fprintf(stderr, "[draft-ipc-daemon] missing value for %s\n", opt.name);
                return 2;
            }
            if (!parse_cli_int(value, *opt.target)) {
                std::fprintf(stderr, "[draft-ipc-daemon] invalid integer for %s: '%s'\n",
                             opt.name, value);
                return 2;
            }
            handled = true;
            break;
        }
        if (!handled) {
            std::fprintf(stderr, "[draft-ipc-daemon] unknown option: %s\n", option_text);
            return 2;
        }
    }

    // Negative GPU indices are clamped to device 0, as before.
    draft_gpu = std::max(0, draft_gpu);

    return run_dflash_draft_ipc_daemon(draft_path, ring_cap, draft_gpu, stream_fd);
}
18 changes: 18 additions & 0 deletions dflash/src/placement/remote_draft_config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Remote draft execution configuration for mixed-backend target/draft placement.

#pragma once

#include <string>

namespace dflash::common {

// Settings describing remote draft execution. A default-constructed value has
// both strings empty and ring_cap == 0, i.e. remote draft is disabled with no
// auxiliary options set.
struct RemoteDraftConfig {
    // Remote draft is considered enabled exactly when this is non-empty.
    // NOTE(review): presumably the path of the draft IPC daemon binary -- confirm.
    std::string ipc_bin;

    // Auxiliary option; counts as set when non-empty.
    // NOTE(review): presumably a working directory for the daemon -- confirm.
    std::string work_dir;

    // Auxiliary option; counts as set only when strictly positive.
    int ring_cap = 0;

    // True when an IPC binary has been configured.
    bool enabled() const {
        return ipc_bin.size() > 0;
    }

    // True when at least one auxiliary option (work_dir or ring_cap) is set,
    // independent of whether remote draft itself is enabled.
    bool has_aux_options() const {
        if (ring_cap > 0) {
            return true;
        }
        return work_dir.size() > 0;
    }
};

} // namespace dflash::common
Loading
Loading