Skip to content

Commit 070ff4d

Browse files
authored
mtmd: add --image-min/max-tokens (#16921)
1 parent bf7b0c9 commit 070ff4d

File tree

8 files changed: 79 additions (+79), 22 deletions (-22)

common/arg.cpp

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2768,6 +2768,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
27682768
params.image.emplace_back(value);
27692769
}
27702770
).set_examples({LLAMA_EXAMPLE_MTMD}));
2771+
add_opt(common_arg(
2772+
{"--image-min-tokens"}, "N",
2773+
"minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
2774+
[](common_params & params, int value) {
2775+
params.image_min_tokens = value;
2776+
}
2777+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MIN_TOKENS"));
2778+
add_opt(common_arg(
2779+
{"--image-max-tokens"}, "N",
2780+
"maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)",
2781+
[](common_params & params, int value) {
2782+
params.image_max_tokens = value;
2783+
}
2784+
).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS"));
27712785
if (llama_supports_rpc()) {
27722786
add_opt(common_arg(
27732787
{"--rpc"}, "SERVERS",

common/common.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -406,6 +406,8 @@ struct common_params {
406406
bool mmproj_use_gpu = true; // use GPU for multimodal model
407407
bool no_mmproj = false; // explicitly disable multimodal model
408408
std::vector<std::string> image; // path to image file(s)
409+
int image_min_tokens = -1;
410+
int image_max_tokens = -1;
409411

410412
// finetune
411413
struct lr_opt lr;

tools/mtmd/clip.cpp

Lines changed: 34 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -169,8 +169,8 @@ struct clip_hparams {
169169
int32_t n_layer;
170170
// idefics3
171171
int32_t image_longest_edge = 0;
172-
int32_t image_min_pixels = 0;
173-
int32_t image_max_pixels = 0;
172+
int32_t image_min_pixels = -1;
173+
int32_t image_max_pixels = -1;
174174
int32_t n_merge = 0; // number of patch merges **per-side**
175175

176176
float image_mean[3];
@@ -203,11 +203,15 @@ struct clip_hparams {
203203
int minicpmv_version = 0;
204204
int32_t minicpmv_query_num = 0; // MiniCPM-V query number
205205

206+
// custom value provided by user, can be undefined if not set
207+
int32_t custom_image_min_tokens = -1;
208+
int32_t custom_image_max_tokens = -1;
209+
206210
void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) {
207211
const int cur_merge = n_merge == 0 ? 1 : n_merge;
208212
const int patch_area = patch_size * patch_size * cur_merge * cur_merge;
209-
image_min_pixels = n_tokens_min * patch_area;
210-
image_max_pixels = n_tokens_max * patch_area;
213+
image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area;
214+
image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area;
211215
warmup_image_size = static_cast<int>(std::sqrt(image_max_pixels));
212216
}
213217

@@ -216,6 +220,7 @@ struct clip_hparams {
216220
GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n");
217221
const int cur_merge = n_merge == 0 ? 1 : n_merge;
218222
warmup_image_size = n_tok_per_side * patch_size * cur_merge;
223+
// TODO: support warmup size for custom token numbers
219224
}
220225
};
221226

@@ -459,6 +464,13 @@ struct clip_ctx {
459464
LOG_INF("%s: CLIP using CPU backend\n", __func__);
460465
}
461466

467+
if (ctx_params.image_min_tokens > 0) {
468+
model.hparams.custom_image_min_tokens = ctx_params.image_min_tokens;
469+
}
470+
if (ctx_params.image_max_tokens > 0) {
471+
model.hparams.custom_image_max_tokens = ctx_params.image_max_tokens;
472+
}
473+
462474
backend_ptrs.push_back(backend_cpu);
463475
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));
464476

@@ -2786,6 +2798,12 @@ struct clip_model_loader {
27862798
// see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
27872799
hparams.set_limit_image_tokens(8, 2048);
27882800
hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
2801+
const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
2802+
if (hparams.image_min_pixels < warn_min_pixels) {
2803+
LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
2804+
LOG_WRN("%s: if you encounter problems with accuracy, try adding --image-min-tokens 1024\n", __func__);
2805+
LOG_WRN("%s: more info: https://github.com/ggml-org/llama.cpp/issues/16842\n\n", __func__);
2806+
}
27892807
} break;
27902808
case PROJECTOR_TYPE_LLAMA4:
27912809
{
@@ -2810,6 +2828,13 @@ struct clip_model_loader {
28102828
break;
28112829
}
28122830

2831+
// sanity check
2832+
{
2833+
if (hparams.image_max_pixels < hparams.image_min_pixels) {
2834+
throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
2835+
}
2836+
}
2837+
28132838
LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
28142839
LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
28152840
LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
@@ -2826,10 +2851,10 @@ struct clip_model_loader {
28262851
LOG_INF("%s: n_merge: %d\n", __func__, hparams.n_merge);
28272852
LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
28282853
if (hparams.image_min_pixels > 0) {
2829-
LOG_INF("%s: image_min_pixels: %d\n", __func__, hparams.image_min_pixels);
2854+
LOG_INF("%s: image_min_pixels: %d%s\n", __func__, hparams.image_min_pixels, hparams.custom_image_min_tokens > 0 ? " (custom value)" : "");
28302855
}
28312856
if (hparams.image_max_pixels > 0) {
2832-
LOG_INF("%s: image_max_pixels: %d\n", __func__, hparams.image_max_pixels);
2857+
LOG_INF("%s: image_max_pixels: %d%s\n", __func__, hparams.image_max_pixels, hparams.custom_image_max_tokens > 0 ? " (custom value)" : "");
28332858
}
28342859
} else if (is_audio) {
28352860
LOG_INF("\n--- audio hparams ---\n");
@@ -4169,7 +4194,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
41694194
case PROJECTOR_TYPE_QWEN25VL:
41704195
case PROJECTOR_TYPE_QWEN3VL:
41714196
{
4172-
// step 1: make a blank canvas which aligns to the grid
4197+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
41734198
clip_image_u8 resized;
41744199
const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
41754200
original_size,
@@ -4262,7 +4287,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
42624287
case PROJECTOR_TYPE_PIXTRAL:
42634288
case PROJECTOR_TYPE_LIGHTONOCR:
42644289
{
4265-
GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
4290+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
42664291
clip_image_u8 resized_image;
42674292
// the original pixtral model doesn't have n_merge
42684293
const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
@@ -4296,7 +4321,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
42964321
case PROJECTOR_TYPE_LFM2:
42974322
case PROJECTOR_TYPE_KIMIVL:
42984323
{
4299-
GGML_ASSERT(params.image_min_pixels && params.image_max_pixels);
4324+
GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
43004325
const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
43014326
original_size,
43024327
params.patch_size * params.n_merge,

tools/mtmd/clip.h

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,8 @@ struct clip_context_params {
3333
bool use_gpu;
3434
enum ggml_log_level verbosity;
3535
enum clip_flash_attn_type flash_attn_type;
36+
int image_min_tokens;
37+
int image_max_tokens;
3638
};
3739

3840
struct clip_init_result {

tools/mtmd/mtmd-cli.cpp

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -132,11 +132,13 @@ struct mtmd_cli_context {
132132
void init_vision_context(common_params & params) {
133133
const char * clip_path = params.mmproj.path.c_str();
134134
mtmd_context_params mparams = mtmd_context_params_default();
135-
mparams.use_gpu = params.mmproj_use_gpu;
136-
mparams.print_timings = true;
137-
mparams.n_threads = params.cpuparams.n_threads;
138-
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
139-
mparams.flash_attn_type = params.flash_attn_type;
135+
mparams.use_gpu = params.mmproj_use_gpu;
136+
mparams.print_timings = true;
137+
mparams.n_threads = params.cpuparams.n_threads;
138+
mparams.verbosity = params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
139+
mparams.flash_attn_type = params.flash_attn_type;
140+
mparams.image_min_tokens = params.image_min_tokens;
141+
mparams.image_max_tokens = params.image_max_tokens;
140142
ctx_vision.reset(mtmd_init_from_file(clip_path, model, mparams));
141143
if (!ctx_vision.get()) {
142144
LOG_ERR("Failed to load vision model from %s\n", clip_path);

tools/mtmd/mtmd.cpp

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,8 @@ mtmd_context_params mtmd_context_params_default() {
109109
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
110110
params.media_marker = mtmd_default_marker();
111111
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
112+
params.image_min_tokens = -1;
113+
params.image_max_tokens = -1;
112114
return params;
113115
}
114116

@@ -171,9 +173,13 @@ struct mtmd_context {
171173
}
172174

173175
clip_context_params ctx_clip_params;
174-
ctx_clip_params.use_gpu = ctx_params.use_gpu;
175-
ctx_clip_params.verbosity = ctx_params.verbosity;
176-
ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
176+
ctx_clip_params.use_gpu = ctx_params.use_gpu;
177+
ctx_clip_params.verbosity = ctx_params.verbosity;
178+
ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
179+
// custom image token limits
180+
ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
181+
ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
182+
177183
auto res = clip_init(mmproj_fname, ctx_clip_params);
178184
ctx_v = res.ctx_v;
179185
ctx_a = res.ctx_a;

tools/mtmd/mtmd.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,10 @@ struct mtmd_context_params {
8383
const char * image_marker; // deprecated, use media_marker instead
8484
const char * media_marker;
8585
enum llama_flash_attn_type flash_attn_type;
86+
87+
// limit number of image tokens, only for vision models with dynamic resolution
88+
int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
89+
int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
8690
};
8791

8892
MTMD_API const char * mtmd_default_marker(void);

tools/server/server.cpp

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2452,11 +2452,13 @@ struct server_context {
24522452
std::string & mmproj_path = params_base.mmproj.path;
24532453
if (!mmproj_path.empty()) {
24542454
mtmd_context_params mparams = mtmd_context_params_default();
2455-
mparams.use_gpu = params_base.mmproj_use_gpu;
2456-
mparams.print_timings = false;
2457-
mparams.n_threads = params_base.cpuparams.n_threads;
2458-
mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
2459-
mparams.flash_attn_type = params_base.flash_attn_type;
2455+
mparams.use_gpu = params_base.mmproj_use_gpu;
2456+
mparams.print_timings = false;
2457+
mparams.n_threads = params_base.cpuparams.n_threads;
2458+
mparams.verbosity = params_base.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO;
2459+
mparams.flash_attn_type = params_base.flash_attn_type;
2460+
mparams.image_min_tokens = params_base.image_min_tokens;
2461+
mparams.image_max_tokens = params_base.image_max_tokens;
24602462
mctx = mtmd_init_from_file(mmproj_path.c_str(), model, mparams);
24612463
if (mctx == nullptr) {
24622464
SRV_ERR("failed to load multimodal model, '%s'\n", mmproj_path.c_str());

0 commit comments

Comments
 (0)