Commit 33fb59a

support moe variant
1 parent 8700158 commit 33fb59a

6 files changed: +48 −14 lines


convert_hf_to_gguf.py

Lines changed: 11 additions & 0 deletions
@@ -8051,6 +8051,17 @@ def set_gguf_parameters(self):
         if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
             self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
 
+        # handle M-RoPE, the same as Qwen-VL
+        # note: unlike GLM4 non-MoE, we don't need to permute the weights here since GLM4_MOE uses Neox ordering already
+        rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {}
+        if "mrope_section" in rope_scaling:
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(
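
Note: the padding normalizes a shorter mrope_section from the HF config into the 4-entry [time, height, width, extra] layout that the GGUF metadata expects. A minimal standalone C++ sketch of the same normalization, using hypothetical section values (real models ship their own):

#include <cstdio>
#include <vector>

int main() {
    // hypothetical 3-entry mrope_section as it may appear under rope_scaling
    std::vector<int> mrope_section = {8, 12, 12};

    // pad to 4 dimensions [time, height, width, extra], mirroring the converter
    while (mrope_section.size() < 4) {
        mrope_section.push_back(0);
    }

    printf("MRoPE sections: [%d, %d, %d, %d]\n",
           mrope_section[0], mrope_section[1], mrope_section[2], mrope_section[3]);
    // prints: MRoPE sections: [8, 12, 12, 0]
    return 0;
}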

src/llama-hparams.cpp

Lines changed: 4 additions & 0 deletions
@@ -241,3 +241,7 @@ float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale
 
     return attn_factor;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}
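
Note: a text-only GGUF carries no rope.dimension_sections metadata, so the sections are left at zero; requiring the first two (time and height) to be positive is what marks a multimodal conversion. A self-contained sketch of the predicate on a stand-in struct (not the real llama_hparams):

#include <cassert>

struct hparams_sketch {
    int rope_sections[4] = {0, 0, 0, 0}; // [time, height, width, extra]

    bool use_mrope() const {
        return rope_sections[0] > 0 && rope_sections[1] > 0;
    }
};

int main() {
    hparams_sketch text_only;             // converted without mrope_section
    hparams_sketch mm = {{8, 12, 12, 0}}; // hypothetical padded sections
    assert(!text_only.use_mrope());
    assert(mm.use_mrope());
    return 0;
}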

src/llama-hparams.h

Lines changed: 2 additions & 0 deletions
@@ -275,6 +275,8 @@ struct llama_hparams {
     // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
     //      https://github.com/ggml-org/llama.cpp/pull/17945
     static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
+
+    bool use_mrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

src/llama-model.cpp

Lines changed: 2 additions & 2 deletions
@@ -7838,9 +7838,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
             return LLAMA_ROPE_TYPE_IMROPE;
 
         case LLM_ARCH_GLM4:
-            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
         case LLM_ARCH_GLM4_MOE:
-            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
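
Note: with M-RoPE sections present both arches report MROPE; otherwise GLM4 falls back to normal RoPE while GLM4_MOE falls back to the Neox ordering. A reduced sketch of that selection (simplified enum and function, not the real llama.cpp ones):

#include <cstdio>

enum rope_type_sketch { ROPE_NORM, ROPE_NEOX, ROPE_MROPE };

rope_type_sketch glm4_rope_type(bool is_moe, bool use_mrope) {
    if (use_mrope) {
        return ROPE_MROPE;                 // multimodal GGUF: sections are set
    }
    return is_moe ? ROPE_NEOX : ROPE_NORM; // text-only fallback differs per arch
}

int main() {
    printf("GLM4 text-only:     %d\n", glm4_rope_type(false, false)); // ROPE_NORM
    printf("GLM4_MOE text-only: %d\n", glm4_rope_type(true,  false)); // ROPE_NEOX
    printf("GLM4_MOE M-RoPE:    %d\n", glm4_rope_type(true,  true));  // ROPE_MROPE
    return 0;
}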

src/models/glm4-moe.cpp

Lines changed: 28 additions & 11 deletions
@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
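
Note: roughly speaking, ggml_rope_multi applies the same rotations as ggml_rope_ext, but the angle for each rotary pair is driven by one of four position streams (time, height, width, extra) according to the sections. A reduced sketch of that mapping, assuming hypothetical sections that sum to n_rot/2 rotary pairs (the real kernel lives in ggml):

#include <cstdio>

// which of the four position streams [time, height, width, extra] drives
// rotary pair i, given the cumulative section boundaries
int stream_for_pair(int i, const int sections[4]) {
    int hi = 0;
    for (int s = 0; s < 4; ++s) {
        hi += sections[s];
        if (i < hi) {
            return s;
        }
    }
    return 3; // anything past the listed sections falls into "extra"
}

int main() {
    const int sections[4] = {8, 12, 12, 0}; // hypothetical, sums to 32 pairs
    for (int i = 0; i < 32; ++i) {
        printf("pair %2d -> stream %d\n", i, stream_for_pair(i, sections));
    }
    return 0;
}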

src/models/glm4.cpp

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     inpL = build_inp_embd(model.tok_embd);
 
-    bool use_mrope = rope_type & LLAMA_ROPE_TYPE_MROPE;
+    bool use_mrope = hparams.use_mrope();
     if (ubatch.embd && !use_mrope) {
         // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
         GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
