From a8935c9595244a9877f07a5f55dca081f9f8ab17 Mon Sep 17 00:00:00 2001 From: lzhang Date: Fri, 18 Jul 2025 14:20:29 +0800 Subject: [PATCH 1/3] Fix MinicpmV model converter and clip to avoid using hardcode. --- tools/mtmd/clip-impl.h | 2 + tools/mtmd/clip.cpp | 62 ++++++---- .../minicpmv-convert-image-encoder-to-gguf.py | 107 +++++++++++++----- tools/mtmd/legacy-models/minicpmv-surgery.py | 2 + 4 files changed, 123 insertions(+), 50 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 62c936ed00f77..133e13eea461d 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -44,6 +44,8 @@ #define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" +#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" +#define KEY_MINICPMV_PROJECTION_DIM "clip.minicpmv_projection_dim" // audio-specific #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9146c9e9c4481..2461a21a1d2b4 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -201,6 +201,8 @@ struct clip_hparams { // legacy bool has_llava_projector = false; int minicpmv_version = 0; + int32_t minicpmv_query_num = 0; // MiniCPM-V query number + int32_t minicpmv_projection_dim = 0; // MiniCPM-V projection dimension }; struct clip_layer { @@ -847,13 +849,19 @@ struct clip_graph { int n_embd = clip_n_mmproj_embd(ctx); const int d_head = 128; int n_head = n_embd/d_head; + // Use actual config value if available, otherwise fall back to hardcoded values int num_query = 96; - if (ctx->model.hparams.minicpmv_version == 2) { - num_query = 96; - } else if (ctx->model.hparams.minicpmv_version == 3) { - num_query = 64; - } else if (ctx->model.hparams.minicpmv_version == 4) { - num_query = 64; + if (ctx->model.hparams.minicpmv_query_num > 0) { + num_query = ctx->model.hparams.minicpmv_query_num; + } else { + // Fallback to hardcoded values for legacy models + if (ctx->model.hparams.minicpmv_version == 2) { + num_query = 96; + } else if (ctx->model.hparams.minicpmv_version == 3) { + num_query = 64; + } else if (ctx->model.hparams.minicpmv_version == 4) { + num_query = 64; + } } ggml_tensor * Q = ggml_add(ctx0, @@ -2110,6 +2118,8 @@ struct clip_model_loader { get_u32(KEY_PATCH_SIZE, hparams.patch_size); get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy + get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); + get_u32(KEY_MINICPMV_PROJECTION_DIM, hparams.minicpmv_projection_dim, false); } else if (is_audio) { get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); @@ -3517,14 +3527,20 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_MINICPMV: { - if (params.minicpmv_version == 2) { - n_patches_sq = 96; - } else if (params.minicpmv_version == 3) { - n_patches_sq = 64; - } else if (params.minicpmv_version == 4) { - n_patches_sq = 64; + // Use actual config value if available, otherwise fall back to hardcoded values + if (params.minicpmv_query_num > 0) { + n_patches_sq = params.minicpmv_query_num; } else { - GGML_ABORT("Unknown minicpmv version"); + // Fallback to hardcoded values for legacy models + if (params.minicpmv_version == 2) { + n_patches_sq = 96; + } else if (params.minicpmv_version == 3) { + n_patches_sq = 64; + } else if (params.minicpmv_version == 4) { + n_patches_sq = 64; + } else { + GGML_ABORT("Unknown minicpmv version"); + } } } break; case PROJECTOR_TYPE_QWEN2VL: @@ -4059,14 +4075,20 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_MLP_NORM: return ctx->model.mm_3_b->ne[0]; case PROJECTOR_TYPE_MINICPMV: - if (hparams.minicpmv_version == 2) { - return 4096; - } else if (hparams.minicpmv_version == 3) { - return 3584; - } else if (hparams.minicpmv_version == 4) { - return 3584; + // Use actual config value if available, otherwise fall back to hardcoded values + if (hparams.minicpmv_projection_dim > 0) { + return hparams.minicpmv_projection_dim; + } else { + // Fallback to hardcoded values for legacy models + if (hparams.minicpmv_version == 2) { + return 4096; + } else if (hparams.minicpmv_version == 3) { + return 3584; + } else if (hparams.minicpmv_version == 4) { + return 3584; + } + GGML_ABORT("Unknown minicpmv version"); } - GGML_ABORT("Unknown minicpmv version"); case PROJECTOR_TYPE_GLM_EDGE: return ctx->model.mm_model_mlp_3_w->ne[1]; case PROJECTOR_TYPE_QWEN2VL: diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index cfe0961f9891a..daa2f39ae1e09 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -517,6 +517,16 @@ def bytes_to_unicode(): # output in the same directory as the model if output_dir is None dir_model = args.model_dir +# Read config.json to get actual model configuration +config_path = os.path.join(dir_model, "config.json") +model_config = {} +if os.path.exists(config_path): + with open(config_path, "r", encoding="utf-8") as f: + model_config = json.load(f) + print(f"Loaded config from {config_path}") +else: + print(f"Warning: config.json not found at {config_path}") + if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: vocab = None tokens = None @@ -544,34 +554,59 @@ def bytes_to_unicode(): # processor = CLIPProcessor.from_pretrained(dir_model) minicpmv_version = args.minicpmv_version -emb_dim = 4096 -block_count = 26 -if minicpmv_version == 1: - emb_dim = 2304 - block_count = 26 -elif minicpmv_version == 2: - emb_dim = 4096 - block_count = 27 -elif minicpmv_version == 3: - emb_dim = 3584 - block_count = 27 -elif minicpmv_version == 4: - emb_dim = 3584 - block_count = 27 - -default_vision_config = { - "hidden_size": 1152, - "image_size": 980, - "intermediate_size": 4304, - "model_type": "idefics2", - "num_attention_heads": 16, - "num_hidden_layers": 27, - "patch_size": 14, + +# Use actual config values instead of hardcoded ones +if model_config: + # For the projector/resampler, use the main model's hidden_size + emb_dim = model_config.get("hidden_size", 1536) + + # For the vision model, use vision_config values + vision_config_dict = model_config.get("vision_config", {}) + default_vision_config = { + "hidden_size": vision_config_dict.get("hidden_size", 1152), + "image_size": vision_config_dict.get("image_size", 980), + "intermediate_size": vision_config_dict.get("intermediate_size", 4304), + "model_type": vision_config_dict.get("model_type", "siglip"), + "num_attention_heads": vision_config_dict.get("num_attention_heads", 16), + "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27), + "patch_size": vision_config_dict.get("patch_size", 14), } + # Use vision model's num_hidden_layers for block_count + block_count = vision_config_dict.get("num_hidden_layers", 27) + + print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}") + print(f"Vision config: {default_vision_config}") +else: + # Fallback to original hardcoded logic if config.json not found + emb_dim = 4096 + block_count = 26 + if minicpmv_version == 1: + emb_dim = 2304 + block_count = 26 + elif minicpmv_version == 2: + emb_dim = 4096 + block_count = 27 + elif minicpmv_version == 3: + emb_dim = 3584 + block_count = 27 + elif minicpmv_version == 4: + emb_dim = 3584 + block_count = 27 + + default_vision_config = { + "hidden_size": 1152, + "image_size": 980, + "intermediate_size": 4304, + "model_type": "idefics2", + "num_attention_heads": 16, + "num_hidden_layers": 27, + "patch_size": 14, + } + vision_config = Idefics2VisionConfig(**default_vision_config) model = Idefics2VisionTransformer(vision_config) -if minicpmv_version == 3: +if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"): vision_config = SiglipVisionConfig(**default_vision_config) model = SiglipVisionTransformer(vision_config) elif minicpmv_version == 4: @@ -626,16 +661,28 @@ def bytes_to_unicode(): fout.add_description("two-tower CLIP model") if has_vision_encoder: - # vision_model hparams - fout.add_uint32("clip.vision.image_size", 448) - fout.add_uint32("clip.vision.patch_size", 14) - fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152) - fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304) + # vision_model hparams - use actual config values + vision_image_size = model_config.get("image_size", 448) if model_config else 448 + vision_patch_size = default_vision_config.get("patch_size", 14) + vision_hidden_size = default_vision_config.get("hidden_size", 1152) + vision_intermediate_size = default_vision_config.get("intermediate_size", 4304) + vision_attention_heads = default_vision_config.get("num_attention_heads", 16) + + fout.add_uint32("clip.vision.image_size", vision_image_size) + fout.add_uint32("clip.vision.patch_size", vision_patch_size) + fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size) + fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size) fout.add_uint32("clip.vision.projection_dim", 0) - fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16) + fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads) fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count) + # Add MiniCPM-V specific parameters + query_num = model_config.get("query_num", 0) if model_config else 0 + resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0 + fout.add_uint32("clip.minicpmv_query_num", query_num) + fout.add_uint32("clip.minicpmv_projection_dim", resampler_emb_dim) + if processor is not None: image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std diff --git a/tools/mtmd/legacy-models/minicpmv-surgery.py b/tools/mtmd/legacy-models/minicpmv-surgery.py index ba82116582b1f..53526623cd7cb 100644 --- a/tools/mtmd/legacy-models/minicpmv-surgery.py +++ b/tools/mtmd/legacy-models/minicpmv-surgery.py @@ -16,6 +16,8 @@ # store these tensors in a new dictionary and torch.save them projector = {name: checkpoint[name].float() for name in mm_tensors} +if 'resampler.proj' in projector.keys() and hasattr(model.llm.config,'scale_emb') is True: + projector['resampler.proj'] = projector['resampler.proj'] / model.llm.config.scale_emb torch.save(projector, f"{args.model}/minicpmv.projector") clip_tensors = [k for k, v in checkpoint.items() if k.startswith("vpm")] From 0fec64bd43817c10c372c5d35f5685f6030514a8 Mon Sep 17 00:00:00 2001 From: lzhang Date: Mon, 28 Jul 2025 13:48:44 +0800 Subject: [PATCH 2/3] Code update for pr/14750 --- tools/mtmd/clip-impl.h | 2 +- tools/mtmd/clip.cpp | 44 +++++++++++++----------------------------- 2 files changed, 14 insertions(+), 32 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 133e13eea461d..6db4d279f10b3 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -45,7 +45,7 @@ #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" -#define KEY_MINICPMV_PROJECTION_DIM "clip.minicpmv_projection_dim" +#define KEY_MINICPMV_PROJ_DIM "clip.minicpmv_projection_dim" // audio-specific #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 2461a21a1d2b4..1615c5add94a8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -850,20 +850,7 @@ struct clip_graph { const int d_head = 128; int n_head = n_embd/d_head; // Use actual config value if available, otherwise fall back to hardcoded values - int num_query = 96; - if (ctx->model.hparams.minicpmv_query_num > 0) { - num_query = ctx->model.hparams.minicpmv_query_num; - } else { - // Fallback to hardcoded values for legacy models - if (ctx->model.hparams.minicpmv_version == 2) { - num_query = 96; - } else if (ctx->model.hparams.minicpmv_version == 3) { - num_query = 64; - } else if (ctx->model.hparams.minicpmv_version == 4) { - num_query = 64; - } - } - + int num_query = ctx->model.hparams.minicpmv_query_num; ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); @@ -2119,8 +2106,17 @@ struct clip_model_loader { get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); - get_u32(KEY_MINICPMV_PROJECTION_DIM, hparams.minicpmv_projection_dim, false); - + get_u32(KEY_MINICPMV_PROJ_DIM, hparams.minicpmv_projection_dim, false); + if (hparams.minicpmv_query_num == 0) { + // Fallback to hardcoded values for legacy models + if (hparams.minicpmv_version == 3) { + hparams.minicpmv_query_num = 64; + } else if (hparams.minicpmv_version == 4) { + hparams.minicpmv_query_num = 64; + } else { + hparams.minicpmv_query_num = 96; + } + } } else if (is_audio) { get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins); @@ -4063,7 +4059,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - const auto & hparams = ctx->model.hparams; switch (ctx->model.proj_type) { case PROJECTOR_TYPE_LDP: return ctx->model.mm_model_block_1_block_2_1_b->ne[0]; @@ -4075,20 +4070,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_MLP_NORM: return ctx->model.mm_3_b->ne[0]; case PROJECTOR_TYPE_MINICPMV: - // Use actual config value if available, otherwise fall back to hardcoded values - if (hparams.minicpmv_projection_dim > 0) { - return hparams.minicpmv_projection_dim; - } else { - // Fallback to hardcoded values for legacy models - if (hparams.minicpmv_version == 2) { - return 4096; - } else if (hparams.minicpmv_version == 3) { - return 3584; - } else if (hparams.minicpmv_version == 4) { - return 3584; - } - GGML_ABORT("Unknown minicpmv version"); - } + return ctx->model.mm_model_proj->ne[0]; case PROJECTOR_TYPE_GLM_EDGE: return ctx->model.mm_model_mlp_3_w->ne[1]; case PROJECTOR_TYPE_QWEN2VL: From d432b2130d9db6f94abd36b72262e5cbdf5050de Mon Sep 17 00:00:00 2001 From: lzhang Date: Tue, 29 Jul 2025 21:35:22 +0800 Subject: [PATCH 3/3] Remove unused field, update script path in docs. --- docs/multimodal/minicpmo2.6.md | 6 +++--- docs/multimodal/minicpmv2.5.md | 4 ++-- docs/multimodal/minicpmv2.6.md | 6 +++--- tools/mtmd/clip-impl.h | 1 - tools/mtmd/clip.cpp | 2 -- .../legacy-models/minicpmv-convert-image-encoder-to-gguf.py | 3 +-- 6 files changed, 9 insertions(+), 13 deletions(-) diff --git a/docs/multimodal/minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md index 8c6db8efe5b53..5b213dde2ef75 100644 --- a/docs/multimodal/minicpmo2.6.md +++ b/docs/multimodal/minicpmo2.6.md @@ -13,7 +13,7 @@ If there are differences in usage, please refer to the official build [documenta Clone llama.cpp: ```bash -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` @@ -29,8 +29,8 @@ cmake --build build --config Release Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us) ```bash -python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6 -python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4 +python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6 +python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model # quantize int4 version diff --git a/docs/multimodal/minicpmv2.5.md b/docs/multimodal/minicpmv2.5.md index 19b439607d44c..c995764082171 100644 --- a/docs/multimodal/minicpmv2.5.md +++ b/docs/multimodal/minicpmv2.5.md @@ -28,8 +28,8 @@ cmake --build build --config Release Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us) ```bash -python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 -python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2 +python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5 +python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model # quantize int4 version diff --git a/docs/multimodal/minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md index 15c1bbd12ebcb..f3796ea12ac9e 100644 --- a/docs/multimodal/minicpmv2.6.md +++ b/docs/multimodal/minicpmv2.6.md @@ -12,7 +12,7 @@ If there are differences in usage, please refer to the official build [documenta Clone llama.cpp: ```bash -git clone https://github.com/ggerganov/llama.cpp +git clone https://github.com/ggml-org/llama.cpp cd llama.cpp ``` @@ -28,8 +28,8 @@ cmake --build build --config Release Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us) ```bash -python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6 -python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3 +python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6 +python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3 python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model # quantize int4 version diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 6db4d279f10b3..921d5445203c5 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -45,7 +45,6 @@ #define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" #define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num" -#define KEY_MINICPMV_PROJ_DIM "clip.minicpmv_projection_dim" // audio-specific #define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 1615c5add94a8..04c6839411ad2 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -202,7 +202,6 @@ struct clip_hparams { bool has_llava_projector = false; int minicpmv_version = 0; int32_t minicpmv_query_num = 0; // MiniCPM-V query number - int32_t minicpmv_projection_dim = 0; // MiniCPM-V projection dimension }; struct clip_layer { @@ -2106,7 +2105,6 @@ struct clip_model_loader { get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false); - get_u32(KEY_MINICPMV_PROJ_DIM, hparams.minicpmv_projection_dim, false); if (hparams.minicpmv_query_num == 0) { // Fallback to hardcoded values for legacy models if (hparams.minicpmv_version == 3) { diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py index daa2f39ae1e09..6c156e2c25624 100644 --- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py +++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py @@ -520,7 +520,7 @@ def bytes_to_unicode(): # Read config.json to get actual model configuration config_path = os.path.join(dir_model, "config.json") model_config = {} -if os.path.exists(config_path): +if os.path.isfile(config_path): with open(config_path, "r", encoding="utf-8") as f: model_config = json.load(f) print(f"Loaded config from {config_path}") @@ -681,7 +681,6 @@ def bytes_to_unicode(): query_num = model_config.get("query_num", 0) if model_config else 0 resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0 fout.add_uint32("clip.minicpmv_query_num", query_num) - fout.add_uint32("clip.minicpmv_projection_dim", resampler_emb_dim) if processor is not None: image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean