Commit 33fb59a

support moe variant
1 parent 8700158 commit 33fb59a

6 files changed: +48 −14 lines


convert_hf_to_gguf.py

Lines changed: 11 additions & 0 deletions
@@ -8051,6 +8051,17 @@ def set_gguf_parameters(self):
         if (num_nextn_predict_layers := self.hparams.get("num_nextn_predict_layers")) is not None:
             self.gguf_writer.add_nextn_predict_layers(num_nextn_predict_layers)
 
+        # handle M-RoPE, the same as Qwen-VL
+        # note: unlike GLM4 non-MoE, we don't need to permute the weights here since GLM4_MOE uses Neox ordering already
+        rope_scaling = self.hparams.get("rope_scaling") or self.hparams.get("rope_parameters") or {}
+        if "mrope_section" in rope_scaling:
+            mrope_section = rope_scaling["mrope_section"]
+            # Pad to 4 dimensions [time, height, width, extra]
+            while len(mrope_section) < 4:
+                mrope_section.append(0)
+            self.gguf_writer.add_rope_dimension_sections(mrope_section[:4])
+            logger.info(f"MRoPE sections: {mrope_section[:4]}")
+
     _experts: list[dict[str, Tensor]] | None = None
 
     def modify_tensors(
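
Note: the padding normalizes a shorter mrope_section from the HF config into the 4-entry [time, height, width, extra] layout that the GGUF metadata expects. A minimal standalone C++ sketch of the same normalization, using hypothetical section values (real models ship their own):

#include <cstdio>
#include <vector>

int main() {
    // hypothetical 3-entry mrope_section as it may appear under rope_scaling
    std::vector<int> mrope_section = {8, 12, 12};

    // pad to 4 dimensions [time, height, width, extra], mirroring the converter
    while (mrope_section.size() < 4) {
        mrope_section.push_back(0);
    }

    printf("MRoPE sections: [%d, %d, %d, %d]\n",
           mrope_section[0], mrope_section[1], mrope_section[2], mrope_section[3]);
    // prints: MRoPE sections: [8, 12, 12, 0]
    return 0;
}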

src/llama-hparams.cpp

Lines changed: 4 additions & 0 deletions
@@ -241,3 +241,7 @@ float llama_hparams::yarn_attn_factor_adjust(float attn_factor, float freq_scale
 
     return attn_factor;
 }
+
+bool llama_hparams::use_mrope() const {
+    return rope_sections[0] > 0 && rope_sections[1] > 0;
+}
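
Note: a text-only GGUF carries no rope.dimension_sections metadata, so the sections are left at zero; requiring the first two (time and height) to be positive is what marks a multimodal conversion. A self-contained sketch of the predicate on a stand-in struct (not the real llama_hparams):

#include <cassert>

struct hparams_sketch {
    int rope_sections[4] = {0, 0, 0, 0}; // [time, height, width, extra]

    bool use_mrope() const {
        return rope_sections[0] > 0 && rope_sections[1] > 0;
    }
};

int main() {
    hparams_sketch text_only;             // converted without mrope_section
    hparams_sketch mm = {{8, 12, 12, 0}}; // hypothetical padded sections
    assert(!text_only.use_mrope());
    assert(mm.use_mrope());
    return 0;
}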

src/llama-hparams.h

Lines changed: 2 additions & 0 deletions
@@ -275,6 +275,8 @@ struct llama_hparams {
     // ref: https://github.com/ggml-org/llama.cpp/discussions/7416
     //      https://github.com/ggml-org/llama.cpp/pull/17945
     static float yarn_attn_factor_adjust(float attn_factor, float freq_scale, float ext_factor);
+
+    bool use_mrope() const;
 };
 
 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");

src/llama-model.cpp

Lines changed: 2 additions & 2 deletions
@@ -7838,9 +7838,9 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
             return LLAMA_ROPE_TYPE_IMROPE;
 
         case LLM_ARCH_GLM4:
-            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NORM;
         case LLM_ARCH_GLM4_MOE:
-            return model->hparams.rope_sections[0] ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
+            return model->hparams.use_mrope() ? LLAMA_ROPE_TYPE_MROPE : LLAMA_ROPE_TYPE_NEOX;
 
         // all model arches should be listed explicitly here
         case LLM_ARCH_UNKNOWN:
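
Note: with M-RoPE sections present both arches report MROPE; otherwise GLM4 falls back to normal RoPE while GLM4_MOE falls back to the Neox ordering. A reduced sketch of that selection (simplified enum and function, not the real llama.cpp ones):

#include <cstdio>

enum rope_type_sketch { ROPE_NORM, ROPE_NEOX, ROPE_MROPE };

rope_type_sketch glm4_rope_type(bool is_moe, bool use_mrope) {
    if (use_mrope) {
        return ROPE_MROPE;                 // multimodal GGUF: sections are set
    }
    return is_moe ? ROPE_NEOX : ROPE_NORM; // text-only fallback differs per arch
}

int main() {
    printf("GLM4 text-only:     %d\n", glm4_rope_type(false, false)); // ROPE_NORM
    printf("GLM4_MOE text-only: %d\n", glm4_rope_type(true,  false)); // ROPE_NEOX
    printf("GLM4_MOE M-RoPE:    %d\n", glm4_rope_type(true,  true));  // ROPE_MROPE
    return 0;
}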

src/models/glm4-moe.cpp

Lines changed: 28 additions & 11 deletions
@@ -5,11 +5,20 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
 
     GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
+    int sections[4];
+    std::copy(std::begin(hparams.rope_sections), std::begin(hparams.rope_sections) + 4, sections);
+
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
     inpL = build_inp_embd(model.tok_embd);
 
+    bool use_mrope = hparams.use_mrope();
+    if (ubatch.embd && !use_mrope) {
+        // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
+        GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
+    }
+
     // inp_pos - contains the positions
     ggml_tensor * inp_pos = build_inp_pos();

@@ -60,17 +69,25 @@ llm_build_glm4_moe::llm_build_glm4_moe(const llama_model & model, const llm_grap
                 Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
                 cb(Kcur, "Kcur_normed", il);
             }
-            Qcur = ggml_rope_ext(
-                ctx0, Qcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
-
-            Kcur = ggml_rope_ext(
-                ctx0, Kcur, inp_pos, nullptr,
-                n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                ext_factor, attn_factor, beta_fast, beta_slow
-            );
+
+            if (use_mrope) {
+                Qcur = ggml_rope_multi(ctx0, Qcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_multi(ctx0, Kcur, inp_pos, nullptr,
+                    n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            } else {
+                // Normal RoPE
+                Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+
+                Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot,
+                    rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            }
 
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
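
Note: roughly speaking, ggml_rope_multi applies the same rotations as ggml_rope_ext, but the angle for each rotary pair is driven by one of four position streams (time, height, width, extra) according to the sections. A reduced sketch of that mapping, assuming hypothetical sections that sum to n_rot/2 rotary pairs (the real kernel lives in ggml):

#include <cstdio>

// which of the four position streams [time, height, width, extra] drives
// rotary pair i, given the cumulative section boundaries
int stream_for_pair(int i, const int sections[4]) {
    int hi = 0;
    for (int s = 0; s < 4; ++s) {
        hi += sections[s];
        if (i < hi) {
            return s;
        }
    }
    return 3; // anything past the listed sections falls into "extra"
}

int main() {
    const int sections[4] = {8, 12, 12, 0}; // hypothetical, sums to 32 pairs
    for (int i = 0; i < 32; ++i) {
        printf("pair %2d -> stream %d\n", i, stream_for_pair(i, sections));
    }
    return 0;
}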

src/models/glm4.cpp

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ llm_build_glm4::llm_build_glm4(const llama_model & model, const llm_graph_params
 
     inpL = build_inp_embd(model.tok_embd);
 
-    bool use_mrope = rope_type & LLAMA_ROPE_TYPE_MROPE;
+    bool use_mrope = hparams.use_mrope();
     if (ubatch.embd && !use_mrope) {
         // unfortunately, we need to forcefully stop here, to avoid users complaining about wrong results
         GGML_ABORT("This GGUF does not support multimodal. Please reconvert it.");
