From 8e4a0fc926e80d3528d6f2d0f19eb82e88de9f16 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 2 Jul 2025 14:07:24 +0200 Subject: [PATCH 01/19] Improve Mistral models integration with llama.cpp --- convert_hf_to_gguf.py | 4 +- convert_mistral_to_gguf.py | 1118 +++++++++++++++++ gguf-py/gguf/tensor_mapping.py | 39 +- gguf-py/gguf/utility.py | 20 +- gguf-py/gguf/vocab.py | 249 ++-- pyproject.toml | 1 + requirements.txt | 1 + requirements/requirements-all.txt | 1 + .../requirements-convert_mistral_to_gguf.txt | 13 + 9 files changed, 1337 insertions(+), 109 deletions(-) create mode 100755 convert_mistral_to_gguf.py create mode 100644 requirements/requirements-convert_mistral_to_gguf.txt diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 3f5cefe007cca..b1d2d09dec50a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -104,9 +104,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_model(remote_hf_model_id) self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): + for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_model(remote_hf_model_id).items(): yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) self.get_tensors = get_remote_tensors diff --git a/convert_mistral_to_gguf.py b/convert_mistral_to_gguf.py new file mode 100755 index 0000000000000..3da712f82a566 --- /dev/null +++ b/convert_mistral_to_gguf.py @@ -0,0 +1,1118 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import annotations + +import logging +import argparse +import json +import os +import sys +from enum import IntEnum +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + ContextManager, + Iterable, + Iterator, + Sequence, + Type, + cast, +) + +import numpy as np +import torch + +from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES +from gguf.vocab import MistralTokenizerType, MistralVocab +from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD + +if TYPE_CHECKING: + from torch import Tensor + +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) +import gguf + +logger = logging.getLogger("mistral-to-gguf") + + +###### MODEL DEFINITIONS ###### + + +class SentencePieceTokenTypes(IntEnum): + NORMAL = 1 + UNKNOWN = 2 + CONTROL = 3 + USER_DEFINED = 4 + UNUSED = 5 + BYTE = 6 + + +class ModelType(IntEnum): + TEXT = 1 + MMPROJ = 2 + + +class ModelBase: + dir_model: Path + ftype: gguf.LlamaFileType + fname_out: Path + is_big_endian: bool + endianess: gguf.GGUFEndian + use_temp_file: bool + lazy: bool + hparams: dict[str, Any] + tensor_names: set[str] | None + gguf_writer: gguf.GGUFWriter + model_name: str | None + metadata_override: Path | None + dir_model_card: Path + remote_hf_model_id: str | None + model_arch: MODEL_ARCH + model_type: ModelType + + # subclasses should initialize this! 
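    # For example, TextModel below derives them from params.json:
    #   self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
    #   self.tensor_map  = gguf.get_tensor_name_map(self.model_arch, self.block_count)
    # and MmprojModel derives them from the "vision_encoder" block of params.json.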
+ block_count: int + tensor_map: gguf.TensorNameMap + + def __init__( + self, + dir_model: Path, + ftype: gguf.LlamaFileType, + fname_out: Path, + *, + is_big_endian: bool = False, + use_temp_file: bool = False, + eager: bool = False, + metadata_override: Path | None = None, + model_name: str | None = None, + split_max_tensors: int = 0, + split_max_size: int = 0, + dry_run: bool = False, + small_first_shard: bool = False, + hparams: dict[str, Any] | None = None, + remote_hf_model_id: str | None = None, + ctx: int = 0, + ): + if ( + type(self) is ModelBase + or type(self) is TextModel + or type(self) is MmprojModel + ): + raise TypeError( + f"{type(self).__name__!r} should not be directly instantiated" + ) + + self.ctx = ctx + self.dir_model = dir_model + self.ftype = ftype + self.fname_out = fname_out + self.is_big_endian = is_big_endian + self.endianess = ( + gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE + ) + self.use_temp_file = use_temp_file + self.lazy = not eager or (remote_hf_model_id is not None) + self.remote_hf_model_id = remote_hf_model_id + self.vocab = MistralVocab(self.dir_model) + if remote_hf_model_id is not None: + + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + logger.info( + f"Using remote model with HuggingFace id: {remote_hf_model_id}" + ) + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_model( + remote_hf_model_id + ) + self.tensor_names = set(name for name in remote_tensors.keys()) + for ( + name, + remote_tensor, + ) in gguf.utility.SafetensorRemote.get_list_tensors_model( + remote_hf_model_id + ).items(): + yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) + + self.get_tensors = get_remote_tensors + + self.hparams = ( + ModelBase.load_hparams(self.dir_model) if hparams is None else hparams + ) + self.tensor_names = None + self.metadata_override = metadata_override + self.model_name = model_name + self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py + + # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type + if self.ftype == gguf.LlamaFileType.GUESSED: + _, first_tensor = next(self.get_tensors()) + if first_tensor.dtype == torch.float16: + logger.info( + f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})" + ) + self.ftype = gguf.LlamaFileType.MOSTLY_F16 + else: + logger.info( + f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})" + ) + self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + + # Configure GGUF Writer + self.gguf_writer = gguf.GGUFWriter( + path=None, + arch=MODEL_ARCH_NAMES[self.model_arch], + endianess=self.endianess, + use_temp_file=self.use_temp_file, + split_max_tensors=split_max_tensors, + split_max_size=split_max_size, + dry_run=dry_run, + small_first_shard=small_first_shard, + ) + + @classmethod + def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) + + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: + key = next((k for k in keys if k in self.hparams), None) + if key is not None: + return self.hparams[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_names_from_parts: set[str] = set() + + self.tensor_names = tensor_names_from_parts + weight_map: dict[str, str] = {} + + logger.info("gguf: loading 'consolidated.satensors'") + ctx: 
ContextManager[Any] + from safetensors import safe_open + + ctx = cast( + ContextManager[Any], + safe_open( + self.dir_model / "consolidated.safetensors", + framework="pt", + device="cpu", + ), + ) + + with ctx as model_part: + tensor_names_from_parts.update(model_part.keys()) + + for name in model_part.keys(): + if self.lazy: + data = model_part.get_slice(name) + data = LazyTorchTensor.from_safetensors_slice(data) + else: + data = model_part.get_tensor(name) + yield name, data + + # verify tensor name presence and identify potentially missing files + if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) + missing_files = sorted( + set(weight_map[n] for n in missing if n in weight_map) + ) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError( + f"Missing or incomplete model files: {missing_files}\n" + f"Missing tensors: {missing}" + ) + else: + raise ValueError( + "Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}" + ) + + def format_tensor_name( + self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight" + ) -> str: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + raise ValueError( + f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}" + ) + name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in name: + assert bid is not None + name = name.format(bid=bid) + return name + suffix + + def match_model_tensor_name( + self, + name: str, + key: gguf.MODEL_TENSOR, + bid: int | None, + suffix: str = ".weight", + ) -> bool: + if key not in gguf.MODEL_TENSORS[self.model_arch]: + return False + key_name: str = gguf.TENSOR_NAMES[key] + if "{bid}" in key_name: + if bid is None: + return False + key_name = key_name.format(bid=bid) + else: + if bid is not None: + return False + return name == (key_name + suffix) + + def map_tensor_name( + self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias") + ) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + + def set_gguf_parameters(self): + raise NotImplementedError( + "set_gguf_parameters() must be implemented in subclasses" + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( + ".weight," + ) + + for name, data_torch in self.get_tensors(): + # we don't need these + if name.endswith( + (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") + ): + continue + + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break + + for new_name, data_torch in self.modify_tensors(data_torch, name, bid): + # hard coded for pixtral + if name == "vision_language_adapter.w_in.weight": + assert new_name == "mm.23.weight", new_name + new_name = "mm.1.weight" + elif name == "vision_language_adapter.w_out.weight": 
+ assert new_name == "mm.23.weight", new_name + new_name = "mm.2.weight" + + data = data_torch.numpy() + + # if data ends up empty, it means data_torch was a scalar tensor -> restore + if len(data.shape) == 0: + data = data_torch.numpy() + + n_dims = len(data.shape) + data_qtype: gguf.GGMLQuantizationType | bool = False + + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + if n_dims <= 1 or new_name.endswith("_norm.weight"): + data_qtype = gguf.GGMLQuantizationType.F32 + + # Conditions should closely match those in llama_model_quantize_internal in llama.cpp + # Some tensor types are always in float32 + if data_qtype is False and ( + any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.FFN_GATE_INP, + gguf.MODEL_TENSOR.POS_EMBD, + gguf.MODEL_TENSOR.TOKEN_TYPES, + gguf.MODEL_TENSOR.V_ENC_EMBD_POS, + ) + ) + or not new_name.endswith(".weight") + ): + data_qtype = gguf.GGMLQuantizationType.F32 + + if data_qtype is False and any( + self.match_model_tensor_name(new_name, key, bid) + for key in ( + gguf.MODEL_TENSOR.TOKEN_EMBD, + gguf.MODEL_TENSOR.OUTPUT, + ) + ): + if self.ftype in ( + gguf.LlamaFileType.MOSTLY_TQ1_0, + gguf.LlamaFileType.MOSTLY_TQ2_0, + ): + # TODO: use Q4_K and Q6_K + data_qtype = gguf.GGMLQuantizationType.F16 + + # No override (data_qtype is False), or wants to be quantized (data_qtype is True) + if isinstance(data_qtype, bool): + if self.ftype == gguf.LlamaFileType.ALL_F32: + data_qtype = gguf.GGMLQuantizationType.F32 + elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: + data_qtype = gguf.GGMLQuantizationType.F16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: + data_qtype = gguf.GGMLQuantizationType.BF16 + elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: + data_qtype = gguf.GGMLQuantizationType.Q8_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: + data_qtype = gguf.GGMLQuantizationType.TQ1_0 + elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: + data_qtype = gguf.GGMLQuantizationType.TQ2_0 + else: + raise ValueError(f"Unknown file type: {self.ftype.name}") + + try: + data = gguf.quants.quantize(data, data_qtype) + except gguf.QuantError as e: + logger.warning("%s, %s", e, "falling back to F16") + data_qtype = gguf.GGMLQuantizationType.F16 + data = gguf.quants.quantize(data, data_qtype) + + shape = ( + gguf.quant_shape_from_byte_shape(data.shape, data_qtype) + if data.dtype == np.uint8 + else data.shape + ) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" + + # n_dims is implicit in the shape + logger.info( + f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" + ) + + self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MODEL) + + def prepare_metadata(self): + total_params, shared_params, expert_params, expert_count = ( + self.gguf_writer.get_total_parameter_count() + ) + + self.metadata = gguf.Metadata.load( + self.metadata_override, self.dir_model_card, self.model_name, total_params + ) + + # If we are using HF model id, set the metadata name to the model id + if self.remote_hf_model_id: + self.metadata.name = self.remote_hf_model_id + + # Fallback to model directory name if metadata name is still missing + if self.metadata.name is None: + self.metadata.name = self.dir_model.name + + # Generate parameter weight class (useful for leader boards) if not yet determined + if 
self.metadata.size_label is None and total_params > 0: + self.metadata.size_label = gguf.size_label( + total_params, shared_params, expert_params, expert_count + ) + + self.set_type() + + logger.info("Set meta model") + self.metadata.set_gguf_meta_model(self.gguf_writer) + + logger.info("Set model parameters") + self.set_gguf_parameters() + + logger.info("Set model quantization version") + self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + + def write(self): + self.prepare_tensors() + self.prepare_metadata() + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) + self.gguf_writer.close() + + @staticmethod + def load_hparams(dir_model: Path): + with open(dir_model / "params.json", "r", encoding="utf-8") as f: + config = json.load(f) + return config + + +class TextModel(ModelBase): + model_type = ModelType.TEXT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if "text_config" in self.hparams: + # move the text_config to the root level + self.hparams = {**self.hparams, **self.hparams["text_config"]} + + self.block_count = self.find_hparam( + ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] + ) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + logger.info( + f"Converting tokenizer {self.vocab.tokenizer_type} of size {self.vocab.vocab_size}." + ) + + self.gguf_writer.add_tokenizer_model(self.vocab.gguf_tokenizer_model) + + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in self.vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == self.vocab.vocab_size, ( + f"token count ({len(tokens)}) != vocab size ({self.vocab.vocab_size})" + ) + + if self.vocab.tokenizer_type == MistralTokenizerType.tekken: + self.gguf_writer.add_tokenizer_pre("tekken") + self.gguf_writer.add_token_merges( + self.vocab.extract_vocab_merges_from_model() + ) + + logger.info( + f"Setting bos, eos, unk and pad token IDs to {self.vocab.bos_id}, {self.vocab.eos_id}, {self.vocab.unk_id}, {self.vocab.pad_id}." + ) + + self.gguf_writer.add_bos_token_id(self.vocab.bos_id) + self.gguf_writer.add_eos_token_id(self.vocab.eos_id) + self.gguf_writer.add_unk_token_id(self.vocab.unk_id) + self.gguf_writer.add_pad_token_id(self.vocab.pad_id) + + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_vocab_size(self.vocab.vocab_size) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(False) + + def set_vocab_none(self): + logger.info("Skipping tokenizer conversion.") + logger.info("Setting tokenizer to 'none'.") + self.gguf_writer.add_tokenizer_model("none") + + logger.info( + f"Setting bos, eos, unk and pad token IDs to {self.vocab.bos_id}, {self.vocab.eos_id}, {self.vocab.unk_id}, {self.vocab.pad_id}." 
+ ) + self.gguf_writer.add_bos_token_id(self.vocab.bos_id) + self.gguf_writer.add_eos_token_id(self.vocab.eos_id) + self.gguf_writer.add_unk_token_id(self.vocab.unk_id) + self.gguf_writer.add_pad_token_id(self.vocab.pad_id) + + logger.info(f"Setting vocab size to {self.vocab.vocab_size}.") + self.gguf_writer.add_vocab_size(self.vocab.vocab_size) + + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_eos_token(False) + + def prepare_metadata(self): + super().prepare_metadata() + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + fname_default: str = gguf.naming_convention( + self.metadata.name, + self.metadata.basename, + self.metadata.finetune, + self.metadata.version, + self.metadata.size_label, + output_type, + model_type="LoRA" if total_params < 0 else None, + ) + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( + self.fname_out.name, output_type + ) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if self.ctx == 0: + raise ValueError("ctx not passed as argument") + self.gguf_writer.add_context_length(self.ctx) + logger.info(f"gguf: training context length = {self.ctx}") + + if (n_embd := self.find_hparam(["dim"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["hidden_dim"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["n_heads"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("n_kv_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + + if (f_norm_eps := self.find_hparam(["norm_eps"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + +class MmprojModel(ModelBase): + model_type = ModelType.MMPROJ + model_arch = gguf.MODEL_ARCH.MMPROJ + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] + + n_block_keys = ["num_hidden_layers"] + + has_vision_encoder: bool = True + + hparams_vision: dict[str, Any] + + def __init__(self, *args, **kwargs): + 
super().__init__(*args, **kwargs) + + text_config = { + k: v for k, v in self.hparams.items() if k not in ["vision_encoder"] + } + self.n_embd_text = text_config.get("hidden_dim", 0) + assert self.n_embd_text > 0, "n_embd not found in hparams" + + # move vision config to the top level, while preserving the original hparams in global_config + import copy + + self.global_config = copy.deepcopy(self.hparams) + self.hparams_vision = self.get_vision_config() + + self.block_count = self.hparams_vision.get("num_hidden_layers", 0) + assert self.block_count > 0, "num_hidden_layers not found in vision_config" + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def get_vision_config(self) -> dict[str, Any]: + vision_config = self.global_config.get("vision_encoder") + assert vision_config is not None, "vision_config not found in hparams" + return vision_config + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + + if not self.has_vision_encoder: + raise ValueError("MmprojModel must have a vision encoder") + + def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: + assert self.hparams_vision is not None + return self._find_param(self.hparams_vision, keys, optional) + + def _find_param( + self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False + ) -> Any: + key = next((k for k in keys if k in obj), None) + if key is not None: + return obj[key] + if optional: + return None + raise KeyError(f"could not find any of: {keys}") + + +class MistralModel(TextModel): + model_name = "mistral" + model_arch = MODEL_ARCH.LLAMA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + rope_scaling = self.hparams.get("rope_scaling") or {} + if ( + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling + ): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["n_heads"] + n_kv_head = self.hparams.get("n_kv_heads") + is_vision_tensor = any( + name.startswith(prefix) + for prefix in [ + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ] + ) + + if is_vision_tensor: + return [] # skip vision tensors + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = self.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = self.permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + +class PixtralModel(MmprojModel): + model_name = "mistral" + img_break_tok_id = -1 + + def __init__(self, *args, 
**kwargs): + super().__init__(*args, **kwargs) + # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py + self.hparams["layer_norm_eps"] = self.hparams.get("norm_eps", 1e-5) + self.img_break_tok_id = self.hparams_vision.get("image_break_token_id", -1) + assert self.img_break_tok_id >= 0, ( + "image_break_token_id not found in vision_config" + ) + logger.info(f"Image break token id: {self.img_break_tok_id}") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + + self.gguf_writer.add_clip_has_vision_encoder(True) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + + # vision config + self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"])) + self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length( + self.find_vparam(["intermediate_size"]) + ) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) + self.gguf_writer.add_vision_head_count( + self.find_vparam(["num_attention_heads"]) + ) + + # preprocessor config + self.gguf_writer.add_vision_image_mean( + self.hparams_vision.get("image_mean", DATASET_MEAN) + ) + self.gguf_writer.add_vision_image_std( + self.hparams_vision.get("image_std", DATASET_STD) + ) + + self.gguf_writer.add_vision_attention_layernorm_eps( + self.find_hparam(["layer_norm_eps"]) + ) + self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) + + self.gguf_writer.add_vision_use_silu(True) + + # spatial_merge_size + if self.hparams_vision["mm_projector_id"] == "patch_merge": + self.gguf_writer.add_vision_spatial_merge_size( + self.find_vparam(["spatial_merge_size"]) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + n_head = self.hparams_vision["num_attention_heads"] + n_kv_head = n_head + + if any( + name.startswith(prefix) + for prefix in [ + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ] + ): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = MistralModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = MistralModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + if self.img_break_tok_id > 0 and "tok_embeddings.weight" in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + return [(self.map_tensor_name(name), img_break_embd)] + + return [] # skip other tensors + + +# tree of lazy tensors +class LazyTorchTensor(gguf.LazyBase): + _tensor_type = torch.Tensor + # to keep the type-checker happy + dtype: torch.dtype + shape: torch.Size + + # only used when converting a torch.Tensor to a np.ndarray + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + } + + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 + 
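    # Note: a LazyTorchTensor's dtype/shape live on a zero-byte "meta" tensor
    # (see meta_with_dtype_and_shape below); the bytes behind a safetensors slice
    # or a RemoteTensor are only read when the tensor is finally evaluated at write time.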
_dtype_str_map: dict[str, torch.dtype] = { + "F64": torch.float64, + "F32": torch.float32, + "BF16": torch.bfloat16, + "F16": torch.float16, + # "U64": torch.uint64, + "I64": torch.int64, + # "U32": torch.uint32, + "I32": torch.int32, + # "U16": torch.uint16, + "I16": torch.int16, + "U8": torch.uint8, + "I8": torch.int8, + "BOOL": torch.bool, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, + } + + def numpy(self) -> gguf.LazyNumpyTensor: + dtype = self._dtype_map[self.dtype] + return gguf.LazyNumpyTensor( + meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), + args=(self,), + func=(lambda s: s.numpy()), + ) + + @classmethod + def meta_with_dtype_and_shape( + cls, dtype: torch.dtype, shape: tuple[int, ...] + ) -> Tensor: + return torch.empty(size=shape, dtype=dtype, device="meta") + + @classmethod + def from_safetensors_slice(cls, st_slice: Any) -> Tensor: + dtype = cls._dtype_str_map[st_slice.get_dtype()] + shape: tuple[int, ...] = tuple(st_slice.get_shape()) + lazy = cls( + meta=cls.meta_with_dtype_and_shape(dtype, shape), + args=(st_slice,), + func=lambda s: s[:], + ) + return cast(torch.Tensor, lazy) + + @classmethod + def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): + dtype = cls._dtype_str_map[remote_tensor.dtype] + shape = remote_tensor.shape + meta = cls.meta_with_dtype_and_shape(dtype, shape) + lazy = cls( + meta=meta, + args=(remote_tensor,), + func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape), + ) + return cast(torch.Tensor, lazy) + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + return args[0].numpy() + + return cls._wrap_fn(func)(*args, **kwargs) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Convert a huggingface model to a GGML compatible file" + ) + parser.add_argument( + "--outfile", + type=Path, + help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", + ) + parser.add_argument( + "--outtype", + type=str, + choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], + default="bf16", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", + ) + parser.add_argument( + "--bigendian", + action="store_true", + help="model is executed on big endian machine", + ) + parser.add_argument( + "model", + type=Path, + help="directory containing model file", + nargs="?", + ) + parser.add_argument( + "--ctx-train", + type=int, + help="Training context size", + required=False, + ) + parser.add_argument( + "--use-temp-file", + action="store_true", + help="use the tempfile library while processing (helpful when running out of memory, process killed)", + ) + parser.add_argument( + "--no-lazy", + action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--model-name", + type=str, + default=None, + help="name of the model", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="increase output verbosity", + ) + parser.add_argument( + "--split-max-tensors", + type=int, + default=0, + help="max tensors in each split", + ) + parser.add_argument( + "--split-max-size", + type=str, + default="0", + help="max size per split N(M|G)", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="only print out a split plan and exit, without writing any new files", + ) + parser.add_argument( + "--no-tensor-first-split", + action="store_true", + help="do not add tensors to the first split (disabled by default)", + ) + parser.add_argument( + "--metadata", + type=Path, + help="Specify the path for an authorship metadata override file", + ) + parser.add_argument( + "--remote", + action="store_true", + help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'mistralai/Mistral-Small-3.2-24B-Instruct-2506'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", + ) + parser.add_argument( + "--mmproj", + action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
A prefix 'mmproj-' will be added to the output file name.", + ) + + args = parser.parse_args() + return args + + +def split_str_to_n_bytes(split_str: str) -> int: + if split_str.endswith("K"): + n = int(split_str[:-1]) * 1000 + elif split_str.endswith("M"): + n = int(split_str[:-1]) * 1000 * 1000 + elif split_str.endswith("G"): + n = int(split_str[:-1]) * 1000 * 1000 * 1000 + elif split_str.isnumeric(): + n = int(split_str) + else: + raise ValueError( + f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G" + ) + + if n < 0: + raise ValueError(f"Invalid split size: {split_str}, must be positive") + + return n + + +def main() -> None: + args = parse_args() + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + else: + logging.basicConfig(level=logging.INFO) + + dir_model = args.model + + if args.remote: + from huggingface_hub import snapshot_download + + local_dir = snapshot_download( + repo_id=str(dir_model), + allow_patterns=[ + "LICENSE", + "params.json", + "tekken.json", + "*.md", + "tokenizer.model", + ], + ) + dir_model = Path(local_dir) + logger.info(f"Downloaded config and tokenizer to {local_dir}") + + if not dir_model.is_dir(): + logger.error(f"Error: {args.model} is not a directory") + sys.exit(1) + + ftype_map: dict[str, gguf.LlamaFileType] = { + "f32": gguf.LlamaFileType.ALL_F32, + "f16": gguf.LlamaFileType.MOSTLY_F16, + "bf16": gguf.LlamaFileType.MOSTLY_BF16, + "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, + "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, + "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, + "auto": gguf.LlamaFileType.GUESSED, + } + + is_split = args.split_max_tensors > 0 or args.split_max_size != "0" + if args.use_temp_file and is_split: + logger.error("Error: Cannot use temp file when splitting") + sys.exit(1) + + if args.outfile is not None: + fname_out = args.outfile + elif args.remote: + # if remote, use the model ID as the output file name + fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf") + else: + fname_out = dir_model + + logger.info(f"Loading model: {dir_model.name}") + + with torch.inference_mode(): + output_type = ftype_map[args.outtype] + hparams = ModelBase.load_hparams(dir_model) + model_class: Type[ModelBase] + if args.mmproj and hparams.get("vision_encoder") is not None: + model_class = PixtralModel + elif args.mmproj: + raise ValueError( + "Multimodal projector export is only supported for vision models" + ) + else: + model_class = MistralModel + logger.info(f"Model architecture: {model_class.__name__}") + + model_instance = model_class( + dir_model, + output_type, + fname_out, + is_big_endian=args.bigendian, + use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, + model_name=args.model_name, + split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), + dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split, + remote_hf_model_id=str(args.model) if args.remote else None, + ctx=args.ctx_train, + ) + + logger.info("Exporting model...") + model_instance.write() + out_path = ( + f"{model_instance.fname_out.parent}{os.sep}" + if is_split + else model_instance.fname_out + ) + logger.info(f"Model successfully exported to {out_path}") + + +if __name__ == "__main__": + main() diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index bfd4fd37a3f68..c2ff3ce3a8cd1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1066,6 +1066,8 @@ class TensorNameMap: 
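For reference, a typical invocation of the new convert_mistral_to_gguf.py script added above (the local path, context size and repo id are illustrative; the repo id is the example given in the --remote help text):

    python convert_mistral_to_gguf.py /path/to/mistral-model --ctx-train 32768 --outtype bf16
    python convert_mistral_to_gguf.py mistralai/Mistral-Small-3.2-24B-Instruct-2506 --remote --ctx-train 32768

Adding --mmproj instead exports the vision projector, for checkpoints whose params.json contains a vision_encoder section.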
MODEL_TENSOR.V_MMPROJ: ( "multi_modal_projector.linear_{bid}", "visual.merger.mlp.{bid}", # qwen2vl + "vision_language_adapter.w_in", # pixtral + "vision_language_adapter.w_out", # pixtral ), MODEL_TENSOR.V_MMPROJ_FC: ( @@ -1091,7 +1093,8 @@ class TensorNameMap: "vision_tower.vision_model.embeddings.patch_embedding", "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM - "vision_tower.patch_conv", # pixtral + "vision_tower.patch_conv", # pixtral-hf + "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl ), @@ -1108,7 +1111,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated ), @@ -1121,7 +1125,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated ), @@ -1134,7 +1139,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated ), @@ -1143,7 +1149,8 @@ class TensorNameMap: "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM - "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral + "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4 "visual.blocks.{bid}.norm1", # qwen2vl ), @@ -1154,7 +1161,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral "visual.blocks.{bid}.attn.proj", # qwen2vl ), @@ -1164,7 +1172,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 - "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral + "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral "visual.blocks.{bid}.norm2", # qwen2vl ), @@ -1172,14 +1181,16 @@ 
class TensorNameMap: "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 - "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral + "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 "visual.blocks.{bid}.mlp.fc1", # qwen2vl "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl ), MODEL_TENSOR.V_ENC_FFN_GATE: ( - "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral + "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl ), @@ -1187,7 +1198,8 @@ class TensorNameMap: "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 - "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral + "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 "visual.blocks.{bid}.mlp.fc2", # qwen2vl "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl @@ -1203,7 +1215,8 @@ class TensorNameMap: MODEL_TENSOR.V_PRE_NORM: ( "vision_tower.vision_model.pre_layrnorm", - "vision_tower.ln_pre", # pixtral + "vision_tower.ln_pre", # pixtral-hf + "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 ), @@ -1220,6 +1233,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_INP_NORM: ( "multi_modal_projector.norm", + "pre_mm_projector_norm", ), MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( @@ -1275,7 +1289,8 @@ class TensorNameMap: ), MODEL_TENSOR.V_MM_PATCH_MERGER: ( - "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 + "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf + "patch_merger.merging_layer", # mistral ), # audio (mtmd) diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 00adcbc937398..8354bd922c1b7 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -111,7 +111,7 @@ class SafetensorRemote: ALIGNMENT = 8 # bytes @classmethod - def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: + def get_list_tensors_model(cls, model_id: str) -> dict[str, RemoteTensor]: """ Get list of tensors from a Hugging Face model repository. 
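A minimal usage sketch for this entry point, assuming a public repo (gated repos additionally need HF_TOKEN set in the environment); the repo id is the example used in the converter's --remote help:

    from gguf.utility import SafetensorRemote

    tensors = SafetensorRemote.get_list_tensors_model("mistralai/Mistral-Small-3.2-24B-Instruct-2506")
    for name, t in tensors.items():
        # each RemoteTensor knows its dtype and shape; t.data() downloads the bytes on demand
        print(name, t.dtype, t.shape)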
@@ -120,9 +120,13 @@ def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: """ # case 1: model has only one single model.safetensor file is_single_file = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors") + is_single_file_consolidated = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/consolidated.safetensors", user_agent="convert_mistral_to_gguf") if is_single_file: url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors" return cls.get_list_tensors(url) + if is_single_file_consolidated: + url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/consolidated.safetensors" + return cls.get_list_tensors(url) # case 2: model has multiple files index_url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json" @@ -145,7 +149,11 @@ def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: tensors[key] = val return tensors - raise ValueError(f"Model {model_id} does not have any safetensor files") + raise ValueError( + f"No safetensor file has been found for model {model_id}." + "If the repo has safetensor files, make sure the model is public or you have a " + "valid Hugging Face token set in the environment variable HF_TOKEN." + ) @classmethod def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]: @@ -234,7 +242,7 @@ def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes: return response.content[slice(size if size > -1 else None)] @classmethod - def check_file_exist(cls, url: str) -> bool: + def check_file_exist(cls, url: str, user_agent="convert_hf_to_gguf") -> bool: """ Check if a file exists at the given URL. Returns True if the file exists, False otherwise. @@ -247,7 +255,7 @@ def check_file_exist(cls, url: str) -> bool: raise ValueError(f"Invalid URL: {url}") try: - headers = cls._get_request_headers() + headers = cls._get_request_headers(user_agent=user_agent) headers["Range"] = "bytes=0-0" response = requests.head(url, allow_redirects=True, headers=headers) # Success (2xx) or redirect (3xx) @@ -256,9 +264,9 @@ def check_file_exist(cls, url: str) -> bool: return False @classmethod - def _get_request_headers(cls) -> dict[str, str]: + def _get_request_headers(cls, user_agent="convert_hf_to_gguf") -> dict[str, str]: """Prepare common headers for requests.""" - headers = {"User-Agent": "convert_hf_to_gguf"} + headers = {"User-Agent": user_agent} if os.environ.get("HF_TOKEN"): headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}" return headers diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index e1d5aaf47ac46..65916b05d9cb6 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -6,7 +6,16 @@ import json import os from pathlib import Path -from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable +from typing import ( + Any, + Callable, + Sequence, + Mapping, + Iterable, + Protocol, + ClassVar, + runtime_checkable +) try: from sentencepiece import SentencePieceProcessor @@ -36,6 +45,7 @@ from .gguf_writer import GGUFWriter + logger = logging.getLogger(__name__) @@ -46,7 +56,9 @@ class SpecialVocab: chat_template: str | Sequence[Mapping[str, str]] | None def __init__( - self, path: str | os.PathLike[str], load_merges: bool = False, + self, + path: str | os.PathLike[str], + load_merges: bool = False, special_token_types: Iterable[str] | None = None, n_vocab: int | None = None, ): @@ -59,40 +71,60 @@ def __init__( if special_token_types is not None: self.special_token_types = 
special_token_types else: - self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask') + self.special_token_types = ( + "bos", + "eos", + "unk", + "sep", + "pad", + "cls", + "mask", + ) self._load(Path(path)) def __repr__(self) -> str: - return ''.format( - len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset", + return "".format( + len(self.merges), + self.special_token_ids or "unset", + self.add_special_token or "unset", ) def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if self.merges: if not quiet: - logger.info(f'Adding {len(self.merges)} merge(s).') + logger.info(f"Adding {len(self.merges)} merge(s).") gw.add_token_merges(self.merges) elif self.load_merges: - logger.warning('Adding merges requested but no merges found, output may be non-functional.') + logger.warning( + "Adding merges requested but no merges found, output may be non-functional." + ) for typ, tokid in self.special_token_ids.items(): - id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) + id_handler: Callable[[int], None] | None = getattr( + gw, f"add_{typ}_token_id", None + ) if id_handler is None: - logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping') + logger.warning( + f"No handler for special token type {typ} with id {tokid} - skipping" + ) continue if not quiet: - logger.info(f'Setting special token type {typ} to {tokid}') + logger.info(f"Setting special token type {typ} to {tokid}") id_handler(tokid) for typ, value in self.add_special_token.items(): - add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) + add_handler: Callable[[bool], None] | None = getattr( + gw, f"add_add_{typ}_token", None + ) if add_handler is None: - logger.warning(f'No handler for add_{typ}_token with value {value} - skipping') + logger.warning( + f"No handler for add_{typ}_token with value {value} - skipping" + ) continue if not quiet: - logger.info(f'Setting add_{typ}_token to {value}') + logger.info(f"Setting add_{typ}_token to {value}") add_handler(value) if self.chat_template is not None: if not quiet: - logger.info(f'Setting chat_template to {self.chat_template}') + logger.info(f"Setting chat_template to {self.chat_template}") gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: @@ -102,12 +134,12 @@ def _load(self, path: Path) -> None: self._try_load_merges_txt(path) def _try_load_merges_txt(self, path: Path) -> bool: - merges_file = path / 'merges.txt' + merges_file = path / "merges.txt" if not merges_file.is_file(): return False - with open(merges_file, 'r', encoding = 'utf-8') as fp: - first_line = next(fp, '').strip() - if not first_line.startswith('#'): + with open(merges_file, "r", encoding="utf-8") as fp: + first_line = next(fp, "").strip() + if not first_line.startswith("#"): fp.seek(0) line_num = 0 else: @@ -120,9 +152,11 @@ def _try_load_merges_txt(self, path: Path) -> bool: continue parts = line.split(None, 3) if len(parts) != 2: - logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring') + logger.warning( + f"{merges_file.name}: Line {line_num}: Entry malformed, ignoring" + ) continue - merges.append(f'{parts[0]} {parts[1]}') + merges.append(f"{parts[0]} {parts[1]}") self.merges = merges return True @@ -130,37 +164,45 @@ def _set_special_token(self, typ: str, tid: Any) -> None: if not isinstance(tid, int): return if tid < 0: - raise ValueError(f'invalid value for special token type {typ}: {tid}') + raise 
ValueError(f"invalid value for special token type {typ}: {tid}") if self.n_vocab is None or tid < self.n_vocab: if typ in self.special_token_ids: return self.special_token_ids[typ] = tid return - logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') + logger.warning( + f"Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping" + ) def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer = None tokenizer_file = path / 'tokenizer.json' if tokenizer_file.is_file(): - with open(tokenizer_file, encoding = 'utf-8') as f: + with open(tokenizer_file, encoding="utf-8") as f: tokenizer = json.load(f) if self.load_merges: - merges = tokenizer.get('model', {}).get('merges') + merges = tokenizer.get("model", {}).get("merges") if isinstance(merges, list) and merges: if isinstance(merges[0], str): self.merges = merges - elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str): + elif ( + isinstance(merges[0], list) + and len(merges[0]) == 2 + and isinstance(merges[0][0], str) + ): # New format since transformers 4.45 to support spaces in merges # ref: https://github.com/ggml-org/llama.cpp/issues/9692 # TODO: internally store as the new format instead of converting to old - if any(' ' in s for pair in merges for s in pair): - logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}') + if any(" " in s for pair in merges for s in pair): + logger.warning( + f"Spaces in merges detected, encoding as {chr(ord(' ') + 256)!r}" + ) self.merges = [ - ' '.join( + " ".join( [ # ensure the spaces are properly encoded - ''.join( - chr(ord(c) + 256) if c == ' ' else c + "".join( + chr(ord(c) + 256) if c == " " else c for c in part ) for part in pair @@ -170,7 +212,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: ] else: raise ValueError("Unknown tokenizer merges format") - added_tokens = tokenizer.get('added_tokens', {}) + added_tokens = tokenizer.get("added_tokens", {}) else: added_tokens = {} tokenizer_config = None @@ -282,16 +324,18 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: - logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring') + logger.warning( + f"Bad type for chat_template field in {tokenizer_config_file!r} - ignoring" + ) for typ in self.special_token_types: - add_entry = tokenizer_config.get(f'add_{typ}_token') + add_entry = tokenizer_config.get(f"add_{typ}_token") if isinstance(add_entry, bool): self.add_special_token[typ] = add_entry - entry = tokenizer_config.get(f'{typ}_token') + entry = tokenizer_config.get(f"{typ}_token") if isinstance(entry, str): tc_content = entry elif isinstance(entry, dict): - entry_content = entry.get('content') + entry_content = entry.get("content") if not isinstance(entry_content, str): continue tc_content = entry_content @@ -299,20 +343,24 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: continue # We only need the first match here. 
maybe_token_id = next( - (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content), + ( + atok.get("id") + for atok in added_tokens + if atok.get("content") == tc_content + ), None, ) self._set_special_token(typ, maybe_token_id) return True def _try_load_from_config_json(self, path: Path) -> bool: - config_file = path / 'config.json' + config_file = path / "config.json" if not config_file.is_file(): return False - with open(config_file, encoding = 'utf-8') as f: + with open(config_file, encoding="utf-8") as f: config = json.load(f) for typ in self.special_token_types: - self._set_special_token(typ, config.get(f'{typ}_token_id')) + self._set_special_token(typ, config.get(f"{typ}_token_id")) return True @@ -348,54 +396,59 @@ class BpeVocab(Vocab): def __init__(self, base_path: Path): added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / 'vocab.json').exists(): + if (fname_tokenizer := base_path / "vocab.json").exists(): # "slow" tokenizer with open(fname_tokenizer, encoding="utf-8") as f: self.vocab = json.load(f) try: # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - with open(base_path / 'added_tokens.json', encoding="utf-8") as f: + with open(base_path / "added_tokens.json", encoding="utf-8") as f: added_tokens = json.load(f) except FileNotFoundError: pass else: # "fast" tokenizer - fname_tokenizer = base_path / 'tokenizer.json' + fname_tokenizer = base_path / "tokenizer.json" # if this fails, FileNotFoundError propagates to caller with open(fname_tokenizer, encoding="utf-8") as f: tokenizer_json = json.load(f) - tokenizer_model: dict[str, Any] = tokenizer_json['model'] + tokenizer_model: dict[str, Any] = tokenizer_json["model"] if ( - tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) - or tokenizer_json['decoder']['type'] != 'ByteLevel' + tokenizer_model["type"] != "BPE" + or tokenizer_model.get("byte_fallback", False) + or tokenizer_json["decoder"]["type"] != "ByteLevel" ): - raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') + raise FileNotFoundError("Cannot find GPT-2 BPE tokenizer") self.vocab = tokenizer_model["vocab"] - if (added := tokenizer_json.get('added_tokens')) is not None: + if (added := tokenizer_json.get("added_tokens")) is not None: # Added tokens here can be duplicates of the main vocabulary. 
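For context, a rough sketch of how a conversion script consumes these vocab classes (the checkpoint directory is hypothetical and the writer is created only for illustration):

    from pathlib import Path
    import gguf
    from gguf.vocab import BpeVocab, SpecialVocab

    model_dir = Path("/path/to/hf-model")              # hypothetical checkpoint directory
    writer = gguf.GGUFWriter(path=None, arch="llama")

    vocab = BpeVocab(model_dir)                        # reads vocab.json or tokenizer.json
    for text, score, toktype in vocab.bpe_tokens():    # one (bytes, score, type) triple per base token
        ...
    # merges plus bos/eos/unk/pad ids come from SpecialVocab
    SpecialVocab(model_dir, load_merges=True, n_vocab=vocab.vocab_size).add_to_gguf(writer)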
- added_tokens = {item['content']: item['id'] - for item in added - if item['content'] not in self.vocab} + added_tokens = { + item["content"]: item["id"] + for item in added + if item["content"] not in self.vocab + } - vocab_size = len(self.vocab) + vocab_size = len(self.vocab) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " - f"{vocab_size} - {expected_end_id}; got {actual_ids}") + raise ValueError( + f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " + f"{vocab_size} - {expected_end_id}; got {actual_ids}" + ) items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} @@ -425,40 +478,44 @@ def __init__(self, base_path: Path): raise RuntimeError("sentencepiece is not installed") added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / 'tokenizer.model').exists(): + if (fname_tokenizer := base_path / "tokenizer.model").exists(): # normal location try: - with open(base_path / 'added_tokens.json', encoding="utf-8") as f: + with open(base_path / "added_tokens.json", encoding="utf-8") as f: added_tokens = json.load(f) except FileNotFoundError: pass - elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): + elif not (fname_tokenizer := base_path.parent / "tokenizer.model").exists(): # not found in alternate location either - raise FileNotFoundError('Cannot find tokenizer.model') + raise FileNotFoundError("Cannot find tokenizer.model") self.sentencepiece_tokenizer = SentencePieceProcessor() self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) vocab_size = self.sentencepiece_tokenizer.vocab_size() - new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} + new_tokens = { + id: piece for piece, id in added_tokens.items() if id >= vocab_size + } expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) + actual_new_ids = sorted(new_tokens.keys()) if expected_new_ids != actual_new_ids: - raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") + raise ValueError( + f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}" + ) # Token pieces that were added to the base vocabulary. 
- self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): piece = tokenizer.IdToPiece(i) - text = piece.encode("utf-8") + text = piece.encode("utf-8") score: float = tokenizer.GetScore(i) toktype = gguf.TokenType.NORMAL @@ -496,25 +553,27 @@ class LlamaHfVocab(Vocab): name = "hfft" def __init__(self, base_path: Path): - fname_tokenizer = base_path / 'tokenizer.json' + fname_tokenizer = base_path / "tokenizer.json" # if this fails, FileNotFoundError propagates to caller - with open(fname_tokenizer, encoding='utf-8') as f: + with open(fname_tokenizer, encoding="utf-8") as f: tokenizer_json = json.load(f) # pre-check so we know if we need transformers - tokenizer_model: dict[str, Any] = tokenizer_json['model'] + tokenizer_model: dict[str, Any] = tokenizer_json["model"] is_llama3 = ( - tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False) - and not tokenizer_model.get('byte_fallback', True) + tokenizer_model["type"] == "BPE" + and tokenizer_model.get("ignore_merges", False) + and not tokenizer_model.get("byte_fallback", True) ) if is_llama3: - raise TypeError('Llama 3 must be converted with BpeVocab') + raise TypeError("Llama 3 must be converted with BpeVocab") if not is_llama3 and ( - tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) - or tokenizer_json['decoder']['type'] != 'Sequence' + tokenizer_model["type"] != "BPE" + or not tokenizer_model.get("byte_fallback", False) + or tokenizer_json["decoder"]["type"] != "Sequence" ): - raise FileNotFoundError('Cannot find Llama BPE tokenizer') + raise FileNotFoundError("Cannot find Llama BPE tokenizer") try: from transformers import AutoTokenizer @@ -536,7 +595,7 @@ def __init__(self, base_path: Path): # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] self.added_tokens_dict = dict() - self.added_tokens_ids = set() + self.added_tokens_ids = set() # Process added tokens for tok, tokidx in sorted( @@ -557,7 +616,7 @@ def __init__(self, base_path: Path): # Set vocabulary sizes self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer @@ -575,17 +634,27 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: token_text = reverse_vocab[token_id].encode("utf-8") # Yield token text, score, and type - yield token_text, self.get_token_score(token_id), self.get_token_type( - token_id, token_text, self.special_ids # Reuse already stored special IDs + yield ( + token_text, + self.get_token_score(token_id), + self.get_token_type( + token_id, + token_text, + self.special_ids, # Reuse already stored special IDs + ), ) - def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType: + def get_token_type( + self, token_id: int, token_text: bytes, 
special_ids: set[int] + ) -> gguf.TokenType: # Special case for byte tokens - if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): return gguf.TokenType.BYTE # Determine token type based on whether it's a special token - return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL + return ( + gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL + ) def get_token_score(self, token_id: int) -> float: # Placeholder for actual logic to determine the token's score @@ -595,7 +664,9 @@ def get_token_score(self, token_id: int) -> float: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type(self.specials[text], b'', self.special_ids) + toktype = self.get_token_type( + self.specials[text], b"", self.special_ids + ) score = self.get_token_score(self.specials[text]) else: toktype = gguf.TokenType.USER_DEFINED diff --git a/pyproject.toml b/pyproject.toml index 3d71b055a8dbf..69ea98c1dbb8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,4 +42,5 @@ build-backend = "poetry.core.masonry.api" llama-convert-hf-to-gguf = "convert_hf_to_gguf:main" llama-convert-lora-to-gguf = "convert_lora_to_gguf:main" llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main" +llama-convert-mistral-to-gguf = "convert_mistral_to_gguf:main" llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main" diff --git a/requirements.txt b/requirements.txt index f2a18d62879b4..9120254ca1f49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ -r ./requirements/requirements-convert_hf_to_gguf_update.txt -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt -r ./requirements/requirements-convert_lora_to_gguf.txt +-r ./requirements/requirements-convert_mistral_to_gguf.txt -r ./requirements/requirements-tool_bench.txt diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 56b6752ac0645..dc0b83d09b8fc 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -11,6 +11,7 @@ -r ./requirements-convert_hf_to_gguf_update.txt -r ./requirements-convert_legacy_llama.txt -r ./requirements-convert_llama_ggml_to_gguf.txt +-r ./requirements-convert_mistral_to_gguf.txt -r ./requirements-tool_bench.txt -r ./requirements-gguf_editor_gui.txt diff --git a/requirements/requirements-convert_mistral_to_gguf.txt b/requirements/requirements-convert_mistral_to_gguf.txt new file mode 100644 index 0000000000000..5616161201eef --- /dev/null +++ b/requirements/requirements-convert_mistral_to_gguf.txt @@ -0,0 +1,13 @@ +numpy<2.0.0 +gguf>=0.1.0 +protobuf>=4.21.0,<5.0.0 +mistral-common>=1.8.0 +safetensors>=0.5.3 +huggingface_hub>=0.23.2 + +--extra-index-url https://download.pytorch.org/whl/cpu +torch~=2.2.1; platform_machine != "s390x" + +# torch s390x packages can only be found from nightly builds +--extra-index-url https://download.pytorch.org/whl/nightly +torch>=0.0.0.dev0; platform_machine == "s390x" From b7e6e134c420381be4654953704756d9522b6a61 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Thu, 24 Jul 2025 14:22:07 +0200 Subject: [PATCH 02/19] Revert changes and fix gguf --- convert_mistral_to_gguf.py | 8 +- gguf-py/gguf/vocab.py | 264 +++++++++++++------------------------ 2 files changed, 96 insertions(+), 176 deletions(-) diff --git a/convert_mistral_to_gguf.py b/convert_mistral_to_gguf.py index 3da712f82a566..e763630e39ff1 100755 
--- a/convert_mistral_to_gguf.py +++ b/convert_mistral_to_gguf.py @@ -24,6 +24,10 @@ import numpy as np import torch +if "NO_LOCAL_GGUF" not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) + +import gguf from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES from gguf.vocab import MistralTokenizerType, MistralVocab from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD @@ -31,10 +35,6 @@ if TYPE_CHECKING: from torch import Tensor -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) -import gguf - logger = logging.getLogger("mistral-to-gguf") diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 65916b05d9cb6..f531990532102 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -6,16 +6,7 @@ import json import os from pathlib import Path -from typing import ( - Any, - Callable, - Sequence, - Mapping, - Iterable, - Protocol, - ClassVar, - runtime_checkable -) +from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable try: from sentencepiece import SentencePieceProcessor @@ -45,7 +36,6 @@ from .gguf_writer import GGUFWriter - logger = logging.getLogger(__name__) @@ -56,9 +46,7 @@ class SpecialVocab: chat_template: str | Sequence[Mapping[str, str]] | None def __init__( - self, - path: str | os.PathLike[str], - load_merges: bool = False, + self, path: str | os.PathLike[str], load_merges: bool = False, special_token_types: Iterable[str] | None = None, n_vocab: int | None = None, ): @@ -71,60 +59,40 @@ def __init__( if special_token_types is not None: self.special_token_types = special_token_types else: - self.special_token_types = ( - "bos", - "eos", - "unk", - "sep", - "pad", - "cls", - "mask", - ) + self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask') self._load(Path(path)) def __repr__(self) -> str: - return "".format( - len(self.merges), - self.special_token_ids or "unset", - self.add_special_token or "unset", + return ''.format( + len(self.merges), self.special_token_ids or "unset", self.add_special_token or "unset", ) def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if self.merges: if not quiet: - logger.info(f"Adding {len(self.merges)} merge(s).") + logger.info(f'Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) elif self.load_merges: - logger.warning( - "Adding merges requested but no merges found, output may be non-functional." 
- ) + logger.warning('Adding merges requested but no merges found, output may be non-functional.') for typ, tokid in self.special_token_ids.items(): - id_handler: Callable[[int], None] | None = getattr( - gw, f"add_{typ}_token_id", None - ) + id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if id_handler is None: - logger.warning( - f"No handler for special token type {typ} with id {tokid} - skipping" - ) + logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping') continue if not quiet: - logger.info(f"Setting special token type {typ} to {tokid}") + logger.info(f'Setting special token type {typ} to {tokid}') id_handler(tokid) for typ, value in self.add_special_token.items(): - add_handler: Callable[[bool], None] | None = getattr( - gw, f"add_add_{typ}_token", None - ) + add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) if add_handler is None: - logger.warning( - f"No handler for add_{typ}_token with value {value} - skipping" - ) + logger.warning(f'No handler for add_{typ}_token with value {value} - skipping') continue if not quiet: - logger.info(f"Setting add_{typ}_token to {value}") + logger.info(f'Setting add_{typ}_token to {value}') add_handler(value) if self.chat_template is not None: if not quiet: - logger.info(f"Setting chat_template to {self.chat_template}") + logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: @@ -134,12 +102,12 @@ def _load(self, path: Path) -> None: self._try_load_merges_txt(path) def _try_load_merges_txt(self, path: Path) -> bool: - merges_file = path / "merges.txt" + merges_file = path / 'merges.txt' if not merges_file.is_file(): return False - with open(merges_file, "r", encoding="utf-8") as fp: - first_line = next(fp, "").strip() - if not first_line.startswith("#"): + with open(merges_file, 'r', encoding = 'utf-8') as fp: + first_line = next(fp, '').strip() + if not first_line.startswith('#'): fp.seek(0) line_num = 0 else: @@ -152,11 +120,9 @@ def _try_load_merges_txt(self, path: Path) -> bool: continue parts = line.split(None, 3) if len(parts) != 2: - logger.warning( - f"{merges_file.name}: Line {line_num}: Entry malformed, ignoring" - ) + logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring') continue - merges.append(f"{parts[0]} {parts[1]}") + merges.append(f'{parts[0]} {parts[1]}') self.merges = merges return True @@ -164,45 +130,37 @@ def _set_special_token(self, typ: str, tid: Any) -> None: if not isinstance(tid, int): return if tid < 0: - raise ValueError(f"invalid value for special token type {typ}: {tid}") + raise ValueError(f'invalid value for special token type {typ}: {tid}') if self.n_vocab is None or tid < self.n_vocab: if typ in self.special_token_ids: return self.special_token_ids[typ] = tid return - logger.warning( - f"Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping" - ) + logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer = None tokenizer_file = path / 'tokenizer.json' if tokenizer_file.is_file(): - with open(tokenizer_file, encoding="utf-8") as f: + with open(tokenizer_file, encoding = 'utf-8') as f: tokenizer = json.load(f) if self.load_merges: - merges = tokenizer.get("model", {}).get("merges") + merges = tokenizer.get('model', {}).get('merges') if 
isinstance(merges, list) and merges: if isinstance(merges[0], str): self.merges = merges - elif ( - isinstance(merges[0], list) - and len(merges[0]) == 2 - and isinstance(merges[0][0], str) - ): + elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str): # New format since transformers 4.45 to support spaces in merges # ref: https://github.com/ggml-org/llama.cpp/issues/9692 # TODO: internally store as the new format instead of converting to old - if any(" " in s for pair in merges for s in pair): - logger.warning( - f"Spaces in merges detected, encoding as {chr(ord(' ') + 256)!r}" - ) + if any(' ' in s for pair in merges for s in pair): + logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}') self.merges = [ - " ".join( + ' '.join( [ # ensure the spaces are properly encoded - "".join( - chr(ord(c) + 256) if c == " " else c + ''.join( + chr(ord(c) + 256) if c == ' ' else c for c in part ) for part in pair @@ -212,7 +170,7 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: ] else: raise ValueError("Unknown tokenizer merges format") - added_tokens = tokenizer.get("added_tokens", {}) + added_tokens = tokenizer.get('added_tokens', {}) else: added_tokens = {} tokenizer_config = None @@ -307,35 +265,24 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: if not tokenizer_config: return True chat_template_alt = None - chat_template_json = path / 'chat_template.json' - chat_template_jinja = path / 'chat_template.jinja' - if chat_template_jinja.is_file(): - with open(chat_template_jinja, encoding = 'utf-8') as f: - chat_template_alt = f.read() - if additional_templates := list((path / 'additional_chat_templates').glob('*.jinja')): - chat_template_alt = [{'name': 'default', 'template': chat_template_alt}] - for template_path in additional_templates: - with open(template_path, encoding = 'utf-8') as fp: - chat_template_alt.append({'name': template_path.stem, 'template': fp.read()}) - elif chat_template_json.is_file(): - with open(chat_template_json, encoding = 'utf-8') as f: + chat_template_file = path / 'chat_template.json' + if chat_template_file.is_file(): + with open(chat_template_file, encoding = 'utf-8') as f: chat_template_alt = json.load(f).get('chat_template') chat_template = tokenizer_config.get('chat_template', chat_template_alt) if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: - logger.warning( - f"Bad type for chat_template field in {tokenizer_config_file!r} - ignoring" - ) + logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring') for typ in self.special_token_types: - add_entry = tokenizer_config.get(f"add_{typ}_token") + add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): self.add_special_token[typ] = add_entry - entry = tokenizer_config.get(f"{typ}_token") + entry = tokenizer_config.get(f'{typ}_token') if isinstance(entry, str): tc_content = entry elif isinstance(entry, dict): - entry_content = entry.get("content") + entry_content = entry.get('content') if not isinstance(entry_content, str): continue tc_content = entry_content @@ -343,24 +290,20 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: continue # We only need the first match here. 
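The merges handling above deals with the transformers >= 4.45 format, where each merge is a two-element list whose parts may contain literal spaces; before writing, every embedded space is shifted up by 256 code points so the pair separator stays unambiguous. A small sketch of that conversion (helper name illustrative):

def encode_merge_pair(pair: list[str]) -> str:
    # Spaces inside a merge part are re-encoded as chr(ord(' ') + 256) ('Ġ'),
    # then the two parts are joined with a single real space.
    return " ".join(
        "".join(chr(ord(c) + 256) if c == " " else c for c in part)
        for part in pair
    )

# e.g. encode_merge_pair(["a b", "c"]) == "aĠb c"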
maybe_token_id = next( - ( - atok.get("id") - for atok in added_tokens - if atok.get("content") == tc_content - ), + (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content), None, ) self._set_special_token(typ, maybe_token_id) return True def _try_load_from_config_json(self, path: Path) -> bool: - config_file = path / "config.json" + config_file = path / 'config.json' if not config_file.is_file(): return False - with open(config_file, encoding="utf-8") as f: + with open(config_file, encoding = 'utf-8') as f: config = json.load(f) for typ in self.special_token_types: - self._set_special_token(typ, config.get(f"{typ}_token_id")) + self._set_special_token(typ, config.get(f'{typ}_token_id')) return True @@ -396,59 +339,54 @@ class BpeVocab(Vocab): def __init__(self, base_path: Path): added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / "vocab.json").exists(): + if (fname_tokenizer := base_path / 'vocab.json').exists(): # "slow" tokenizer with open(fname_tokenizer, encoding="utf-8") as f: self.vocab = json.load(f) try: # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - with open(base_path / "added_tokens.json", encoding="utf-8") as f: + with open(base_path / 'added_tokens.json', encoding="utf-8") as f: added_tokens = json.load(f) except FileNotFoundError: pass else: # "fast" tokenizer - fname_tokenizer = base_path / "tokenizer.json" + fname_tokenizer = base_path / 'tokenizer.json' # if this fails, FileNotFoundError propagates to caller with open(fname_tokenizer, encoding="utf-8") as f: tokenizer_json = json.load(f) - tokenizer_model: dict[str, Any] = tokenizer_json["model"] + tokenizer_model: dict[str, Any] = tokenizer_json['model'] if ( - tokenizer_model["type"] != "BPE" - or tokenizer_model.get("byte_fallback", False) - or tokenizer_json["decoder"]["type"] != "ByteLevel" + tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'ByteLevel' ): - raise FileNotFoundError("Cannot find GPT-2 BPE tokenizer") + raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') self.vocab = tokenizer_model["vocab"] - if (added := tokenizer_json.get("added_tokens")) is not None: + if (added := tokenizer_json.get('added_tokens')) is not None: # Added tokens here can be duplicates of the main vocabulary. 
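The tokenizer.json, tokenizer_config.json and config.json probes above are all driven from SpecialVocab; at the call site a converter instantiates it once per model directory and flushes the result into the writer. A minimal usage sketch, assuming an already-configured gguf.GGUFWriter and placeholder arguments:

from pathlib import Path
from gguf.vocab import SpecialVocab

def add_special_tokens(gw, model_dir: Path, n_vocab: int) -> None:
    # gw is an existing gguf.GGUFWriter. SpecialVocab loads merges, special
    # token ids, add_*_token flags and the chat template from model_dir, then
    # forwards whatever it found to the writer.
    special_vocab = SpecialVocab(model_dir, load_merges=True, n_vocab=n_vocab)
    special_vocab.add_to_gguf(gw)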
- added_tokens = { - item["content"]: item["id"] - for item in added - if item["content"] not in self.vocab - } + added_tokens = {item['content']: item['id'] + for item in added + if item['content'] not in self.vocab} - vocab_size = len(self.vocab) + vocab_size = len(self.vocab) expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise ValueError( - f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " - f"{vocab_size} - {expected_end_id}; got {actual_ids}" - ) + raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " + f"{vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) - self.added_tokens_dict = added_tokens - self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [text for (text, idx) in items] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} @@ -478,44 +416,40 @@ def __init__(self, base_path: Path): raise RuntimeError("sentencepiece is not installed") added_tokens: dict[str, int] = {} - if (fname_tokenizer := base_path / "tokenizer.model").exists(): + if (fname_tokenizer := base_path / 'tokenizer.model').exists(): # normal location try: - with open(base_path / "added_tokens.json", encoding="utf-8") as f: + with open(base_path / 'added_tokens.json', encoding="utf-8") as f: added_tokens = json.load(f) except FileNotFoundError: pass - elif not (fname_tokenizer := base_path.parent / "tokenizer.model").exists(): + elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): # not found in alternate location either - raise FileNotFoundError("Cannot find tokenizer.model") + raise FileNotFoundError('Cannot find tokenizer.model') self.sentencepiece_tokenizer = SentencePieceProcessor() self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) vocab_size = self.sentencepiece_tokenizer.vocab_size() - new_tokens = { - id: piece for piece, id in added_tokens.items() if id >= vocab_size - } + new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) - actual_new_ids = sorted(new_tokens.keys()) + actual_new_ids = sorted(new_tokens.keys()) if expected_new_ids != actual_new_ids: - raise ValueError( - f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}" - ) + raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") # Token pieces that were added to the base vocabulary. 
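Both the BPE and SentencePiece paths above enforce the same invariant on added tokens: their ids must form one contiguous block immediately after the base vocabulary. The check, distilled into a standalone helper (name illustrative):

def check_added_token_ids(vocab_size: int, added_token_ids: list[int]) -> None:
    # Added tokens may only extend the vocabulary, never overlap it or leave
    # gaps: valid ids are exactly vocab_size .. vocab_size + len(ids) - 1.
    expected = list(range(vocab_size, vocab_size + len(added_token_ids)))
    actual = sorted(added_token_ids)
    if actual != expected:
        raise ValueError(f"expected added token ids {expected}, got {actual}")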
- self.added_tokens_dict = added_tokens - self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] - self.vocab_size_base = vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer + self.added_tokens_dict = added_tokens + self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.fname_tokenizer = fname_tokenizer def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): piece = tokenizer.IdToPiece(i) - text = piece.encode("utf-8") + text = piece.encode("utf-8") score: float = tokenizer.GetScore(i) toktype = gguf.TokenType.NORMAL @@ -553,27 +487,25 @@ class LlamaHfVocab(Vocab): name = "hfft" def __init__(self, base_path: Path): - fname_tokenizer = base_path / "tokenizer.json" + fname_tokenizer = base_path / 'tokenizer.json' # if this fails, FileNotFoundError propagates to caller - with open(fname_tokenizer, encoding="utf-8") as f: + with open(fname_tokenizer, encoding='utf-8') as f: tokenizer_json = json.load(f) # pre-check so we know if we need transformers - tokenizer_model: dict[str, Any] = tokenizer_json["model"] + tokenizer_model: dict[str, Any] = tokenizer_json['model'] is_llama3 = ( - tokenizer_model["type"] == "BPE" - and tokenizer_model.get("ignore_merges", False) - and not tokenizer_model.get("byte_fallback", True) + tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False) + and not tokenizer_model.get('byte_fallback', True) ) if is_llama3: - raise TypeError("Llama 3 must be converted with BpeVocab") + raise TypeError('Llama 3 must be converted with BpeVocab') if not is_llama3 and ( - tokenizer_model["type"] != "BPE" - or not tokenizer_model.get("byte_fallback", False) - or tokenizer_json["decoder"]["type"] != "Sequence" + tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'Sequence' ): - raise FileNotFoundError("Cannot find Llama BPE tokenizer") + raise FileNotFoundError('Cannot find Llama BPE tokenizer') try: from transformers import AutoTokenizer @@ -595,7 +527,7 @@ def __init__(self, base_path: Path): # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] self.added_tokens_dict = dict() - self.added_tokens_ids = set() + self.added_tokens_ids = set() # Process added tokens for tok, tokidx in sorted( @@ -616,7 +548,7 @@ def __init__(self, base_path: Path): # Set vocabulary sizes self.vocab_size_base = self.tokenizer.vocab_size - self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer @@ -634,27 +566,17 @@ def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: token_text = reverse_vocab[token_id].encode("utf-8") # Yield token text, score, and type - yield ( - token_text, - self.get_token_score(token_id), - self.get_token_type( - token_id, - token_text, - self.special_ids, # Reuse already stored special IDs - ), + yield token_text, self.get_token_score(token_id), self.get_token_type( + token_id, token_text, self.special_ids # Reuse already stored special IDs ) - def get_token_type( - self, token_id: int, token_text: bytes, special_ids: set[int] - ) -> gguf.TokenType: + def get_token_type(self, token_id: int, token_text: 
bytes, special_ids: set[int]) -> gguf.TokenType: # Special case for byte tokens - if re.fullmatch(rb"<0x[0-9A-Fa-f]{2}>", token_text): + if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): return gguf.TokenType.BYTE # Determine token type based on whether it's a special token - return ( - gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL - ) + return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL def get_token_score(self, token_id: int) -> float: # Placeholder for actual logic to determine the token's score @@ -664,9 +586,7 @@ def get_token_score(self, token_id: int) -> float: def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: for text in self.added_tokens_list: if text in self.specials: - toktype = self.get_token_type( - self.specials[text], b"", self.special_ids - ) + toktype = self.get_token_type(self.specials[text], b'', self.special_ids) score = self.get_token_score(self.specials[text]) else: toktype = gguf.TokenType.USER_DEFINED From 1be14583ae3d0d0967fa60097c23ceefc4e56fab Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Thu, 24 Jul 2025 14:24:46 +0200 Subject: [PATCH 03/19] Revert change --- gguf-py/gguf/vocab.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index f531990532102..e1d5aaf47ac46 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -265,9 +265,18 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: if not tokenizer_config: return True chat_template_alt = None - chat_template_file = path / 'chat_template.json' - if chat_template_file.is_file(): - with open(chat_template_file, encoding = 'utf-8') as f: + chat_template_json = path / 'chat_template.json' + chat_template_jinja = path / 'chat_template.jinja' + if chat_template_jinja.is_file(): + with open(chat_template_jinja, encoding = 'utf-8') as f: + chat_template_alt = f.read() + if additional_templates := list((path / 'additional_chat_templates').glob('*.jinja')): + chat_template_alt = [{'name': 'default', 'template': chat_template_alt}] + for template_path in additional_templates: + with open(template_path, encoding = 'utf-8') as fp: + chat_template_alt.append({'name': template_path.stem, 'template': fp.read()}) + elif chat_template_json.is_file(): + with open(chat_template_json, encoding = 'utf-8') as f: chat_template_alt = json.load(f).get('chat_template') chat_template = tokenizer_config.get('chat_template', chat_template_alt) if chat_template is None or isinstance(chat_template, (str, list)): From 950eb7381a693b0ee587ad7657819197b9b4f800 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Mon, 28 Jul 2025 10:17:50 +0200 Subject: [PATCH 04/19] refactor convert_mistral_to_gguf.py in convert_hf_to_gguf.py --- convert_hf_to_gguf.py | 335 ++++- convert_mistral_to_gguf.py | 1118 ----------------- gguf-py/gguf/utility.py | 10 +- pyproject.toml | 1 - requirements.txt | 1 - requirements/requirements-all.txt | 1 - .../requirements-convert_hf_to_gguf.txt | 1 + .../requirements-convert_mistral_to_gguf.txt | 13 - 8 files changed, 297 insertions(+), 1183 deletions(-) delete mode 100755 convert_mistral_to_gguf.py delete mode 100644 requirements/requirements-convert_mistral_to_gguf.txt diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b1d2d09dec50a..0b02f37874efc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -28,6 +28,13 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 
'gguf-py')) import gguf +from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES +from gguf.vocab import MistralTokenizerType, MistralVocab +from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD + +if TYPE_CHECKING: + from torch import Tensor + logger = logging.getLogger("hf-to-gguf") @@ -66,6 +73,7 @@ class ModelBase: lazy: bool part_names: list[str] is_safetensors: bool + is_mistral_format: bool hparams: dict[str, Any] tensor_names: set[str] | None gguf_writer: gguf.GGUFWriter @@ -85,12 +93,14 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, + remote_hf_model_id: str | None = None, n_ctx: int = 0, is_mistral_format: bool = False): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out @@ -99,6 +109,12 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.use_temp_file = use_temp_file self.lazy = not eager or (remote_hf_model_id is not None) self.remote_hf_model_id = remote_hf_model_id + self.n_ctx = n_ctx + self.is_mistral_format = is_mistral_format + + if is_mistral_format and not n_ctx: + raise ValueError("Please pass the context length using --ctx when using mistral formats.") + if remote_hf_model_id is not None: self.is_safetensors = True @@ -111,11 +127,12 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.get_tensors = get_remote_tensors else: - self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") + prefix = "model" if not is_mistral_format else "consolidated" + self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams + self.hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format) if hparams is None else hparams self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name @@ -153,19 +170,23 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - index_file = self.dir_model / index_name - - if index_file.is_file(): - self.tensor_names = set() - logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(index_file, "r", encoding="utf-8") as f: - index: dict[str, Any] = json.load(f) - weight_map = index.get("weight_map") - if weight_map is None or not isinstance(weight_map, dict): - raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - self.tensor_names.update(weight_map.keys()) + if not self.is_mistral_format: + index_name = "model.safetensors" if self.is_safetensors else 
"pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): + self.tensor_names = set() + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(index_file, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + self.tensor_names.update(weight_map.keys()) + else: + self.tensor_names = tensor_names_from_parts + weight_map = {} else: self.tensor_names = tensor_names_from_parts weight_map = {} @@ -275,6 +296,14 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): + # hard coded for pixtral + if name == "vision_language_adapter.w_in.weight": + assert new_name == "mm.23.weight", new_name + new_name = "mm.1.weight" + elif name == "vision_language_adapter.w_out.weight": + assert new_name == "mm.23.weight", new_name + new_name = "mm.2.weight" + # TODO: why do we squeeze here? # data = data_torch.squeeze().numpy() data = data_torch.numpy() @@ -426,7 +455,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] return part_names @staticmethod - def load_hparams(dir_model: Path): + def load_hparams(dir_model: Path, is_mistral_format: bool): + if is_mistral_format: + with open(dir_model / "params.json", "r", encoding="utf-8") as f: + config = json.load(f) + return config + try: # for security reason, we don't allow loading remote code by default # if a model need remote code, we will fallback to config.json @@ -476,7 +510,10 @@ class TextModel(ModelBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.hf_arch = get_model_architecture(self.hparams, self.model_type) + if not self.is_mistral_format: + self.hf_arch = get_model_architecture(self.hparams, self.model_type) + else: + self.hf_arch = "" if "text_config" in self.hparams: # move the text_config to the root level @@ -493,7 +530,10 @@ def __init_subclass__(cls): raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") def set_vocab(self): - self._set_vocab_gpt2() + if self.is_mistral_format: + self._set_vocab_mistral() + else: + self._set_vocab_gpt2() def prepare_metadata(self, vocab_only: bool): super().prepare_metadata(vocab_only=vocab_only) @@ -526,7 +566,12 @@ def prepare_metadata(self, vocab_only: bool): def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None: + if self.is_mistral_format: + n_ctx = self.n_ctx + else: + n_ctx = self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True) + + if n_ctx is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -542,14 +587,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(n_head) logger.info(f"gguf: head count = {n_head}") - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) logger.info(f"gguf: key-value head count = {n_head_kv}") if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = 
{rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"])) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: @@ -870,6 +915,50 @@ def get_vocab_base_pre(self, tokenizer) -> str: def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") + + def _set_vocab_mistral(self): + vocab = MistralVocab(self.dir_model) + logger.info( + f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." + ) + + self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) + + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size, ( + f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" + ) + + if vocab.tokenizer_type == MistralTokenizerType.tekken: + self.gguf_writer.add_tokenizer_pre("tekken") + self.gguf_writer.add_token_merges( + vocab.extract_vocab_merges_from_model() + ) + + logger.info( + f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." + ) + + self.gguf_writer.add_bos_token_id(vocab.bos_id) + self.gguf_writer.add_eos_token_id(vocab.eos_id) + self.gguf_writer.add_unk_token_id(vocab.unk_id) + self.gguf_writer.add_pad_token_id(vocab.pad_id) + + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_vocab_size(vocab.vocab_size) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(False) def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() @@ -1198,12 +1287,19 @@ def __init__(self, *args, **kwargs): raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") # get n_embd of the text model - if "text_config" not in self.hparams: - self.hparams["text_config"] = {} - if "audio_config" not in self.hparams: - self.hparams["audio_config"] = {} - text_config = {**self.hparams, **self.hparams["text_config"]} - self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + if not self.is_mistral_format: + if "text_config" not in self.hparams: + self.hparams["text_config"] = {} + if "audio_config" not in self.hparams: + self.hparams["audio_config"] = {} + text_config = {**self.hparams, **self.hparams["text_config"]} + self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + else: + text_config = { + k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"] + } + self.n_embd_text = text_config.get("hidden_dim", 0) + assert self.n_embd_text > 0, "n_embd not found in hparams" # move vision config to the top level, while preserving the original hparams in global_config @@ -1224,11 +1320,13 @@ def __init__(self, *args, **kwargs): self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) # load preprocessor config - with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: - self.preprocessor_config = json.load(f) + if not self.is_mistral_format: + with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: + self.preprocessor_config = json.load(f) def 
get_vision_config(self) -> dict[str, Any] | None: - return self.global_config.get("vision_config") + config_name = "vision_config" if not self.is_mistral_format else "vision_encoder" + return self.global_config.get(config_name) def get_audio_config(self) -> dict[str, Any] | None: return self.global_config.get("audio_config") @@ -1252,8 +1350,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"])) # preprocessor config - self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) - self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) + image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] + image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] + + self.gguf_writer.add_vision_image_mean(image_mean) + self.gguf_writer.add_vision_image_std(image_std) if self.has_audio_encoder: self.gguf_writer.add_clip_has_audio_encoder(True) @@ -4181,7 +4282,7 @@ class BertModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.vocab_size = None + vocab_size = None if cls_out_labels := self.hparams.get("id2label"): if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": @@ -4199,7 +4300,7 @@ def set_gguf_parameters(self): def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() - self.vocab_size = len(tokens) + vocab_size = len(tokens) # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings @@ -7737,6 +7838,132 @@ def prepare_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + +class MistralModel(TextModel): + model_name = "mistral" + model_arch = MODEL_ARCH.LLAMA + undo_permute = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + rope_scaling = self.hparams.get("rope_scaling") or {} + if ( + rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" + and "factor" in rope_scaling + ): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return ( + weights.reshape( + n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] + ) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["n_heads"] + n_kv_head = self.hparams.get("n_kv_heads") + is_vision_tensor = any( + name.startswith(prefix) + for prefix in [ + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ] + ) + + if is_vision_tensor: + return [] # skip vision tensors + + if self.undo_permute: + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = self.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = self.permute(data_torch, n_head, n_kv_head) + + return 
[(self.map_tensor_name(name), data_torch)] + + +class PixtralModel(MmprojModel): + model_name = "mistral" + img_break_tok_id = -1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py + self.hparams["layer_norm_eps"] = self.hparams.get("norm_eps", 1e-5) + self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) + logger.info(f"Image break token id: {self.img_break_tok_id}") + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + + self.gguf_writer.add_vision_attention_layernorm_eps( + self.find_hparam(["layer_norm_eps"]) + ) + self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) + + self.gguf_writer.add_vision_use_silu(True) + + # spatial_merge_size + if self.find_vparam(["mm_projector_id"]) == "patch_merge": + self.gguf_writer.add_vision_spatial_merge_size( + self.find_vparam(["spatial_merge_size"]) + ) + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + del bid # unused + n_head = self.find_vparam(["num_attention_heads"]) + n_kv_head = n_head + + if any( + name.startswith(prefix) + for prefix in [ + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ] + ): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = MistralModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = MistralModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + if self.img_break_tok_id > 0 and "tok_embeddings.weight" in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + return [(self.map_tensor_name(name), img_break_embd)] + + return [] # skip other tensors + ###### CONVERSION LOGIC ###### @@ -7886,6 +8113,16 @@ def parse_args() -> argparse.Namespace: "--mmproj", action="store_true", help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
A prefix 'mmproj-' will be added to the output file name.", ) + parser.add_argument( + "--mistral-format", action="store_true", + help="Whether the model is stored following the Mistral format.", + ) + parser.add_argument( + "--n-ctx", + type=int, + help="Training context size", + default=0 + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -7990,18 +8227,25 @@ def main() -> None: if args.mmproj: if "mmproj" not in fname_out.name: fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") + + is_mistral_format = args.mistral_format with torch.inference_mode(): output_type = ftype_map[args.outtype] model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT - hparams = ModelBase.load_hparams(dir_model) - model_architecture = get_model_architecture(hparams, model_type) - logger.info(f"Model architecture: {model_architecture}") - try: - model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) - except NotImplementedError: - logger.error(f"Model {model_architecture} is not supported") - sys.exit(1) + hparams = ModelBase.load_hparams(dir_model, is_mistral_format) + if not is_mistral_format: + model_architecture = get_model_architecture(hparams, model_type) + logger.info(f"Model architecture: {model_architecture}") + try: + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) + except NotImplementedError: + logger.error(f"Model {model_architecture} is not supported") + sys.exit(1) + elif args.mmproj and hparams.get("vision_encoder"): + model_class = PixtralModel + else: + model_class = MistralModel model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, @@ -8010,7 +8254,10 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=hf_repo_id) + remote_hf_model_id=hf_repo_id, + n_ctx=args.n_ctx, + is_mistral_format=is_mistral_format + ) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/convert_mistral_to_gguf.py b/convert_mistral_to_gguf.py deleted file mode 100755 index e763630e39ff1..0000000000000 --- a/convert_mistral_to_gguf.py +++ /dev/null @@ -1,1118 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from __future__ import annotations - -import logging -import argparse -import json -import os -import sys -from enum import IntEnum -from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - ContextManager, - Iterable, - Iterator, - Sequence, - Type, - cast, -) - -import numpy as np -import torch - -if "NO_LOCAL_GGUF" not in os.environ: - sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) - -import gguf -from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES -from gguf.vocab import MistralTokenizerType, MistralVocab -from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD - -if TYPE_CHECKING: - from torch import Tensor - -logger = logging.getLogger("mistral-to-gguf") - - -###### MODEL DEFINITIONS ###### - - -class SentencePieceTokenTypes(IntEnum): - NORMAL = 1 - UNKNOWN = 2 - CONTROL = 3 - USER_DEFINED = 4 - UNUSED = 5 - BYTE = 6 - - -class ModelType(IntEnum): - TEXT = 1 - MMPROJ = 2 - - -class ModelBase: - dir_model: Path - ftype: gguf.LlamaFileType - fname_out: Path - is_big_endian: bool - endianess: gguf.GGUFEndian - use_temp_file: bool - lazy: bool - hparams: 
dict[str, Any] - tensor_names: set[str] | None - gguf_writer: gguf.GGUFWriter - model_name: str | None - metadata_override: Path | None - dir_model_card: Path - remote_hf_model_id: str | None - model_arch: MODEL_ARCH - model_type: ModelType - - # subclasses should initialize this! - block_count: int - tensor_map: gguf.TensorNameMap - - def __init__( - self, - dir_model: Path, - ftype: gguf.LlamaFileType, - fname_out: Path, - *, - is_big_endian: bool = False, - use_temp_file: bool = False, - eager: bool = False, - metadata_override: Path | None = None, - model_name: str | None = None, - split_max_tensors: int = 0, - split_max_size: int = 0, - dry_run: bool = False, - small_first_shard: bool = False, - hparams: dict[str, Any] | None = None, - remote_hf_model_id: str | None = None, - ctx: int = 0, - ): - if ( - type(self) is ModelBase - or type(self) is TextModel - or type(self) is MmprojModel - ): - raise TypeError( - f"{type(self).__name__!r} should not be directly instantiated" - ) - - self.ctx = ctx - self.dir_model = dir_model - self.ftype = ftype - self.fname_out = fname_out - self.is_big_endian = is_big_endian - self.endianess = ( - gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - ) - self.use_temp_file = use_temp_file - self.lazy = not eager or (remote_hf_model_id is not None) - self.remote_hf_model_id = remote_hf_model_id - self.vocab = MistralVocab(self.dir_model) - if remote_hf_model_id is not None: - - def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: - logger.info( - f"Using remote model with HuggingFace id: {remote_hf_model_id}" - ) - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_model( - remote_hf_model_id - ) - self.tensor_names = set(name for name in remote_tensors.keys()) - for ( - name, - remote_tensor, - ) in gguf.utility.SafetensorRemote.get_list_tensors_model( - remote_hf_model_id - ).items(): - yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) - - self.get_tensors = get_remote_tensors - - self.hparams = ( - ModelBase.load_hparams(self.dir_model) if hparams is None else hparams - ) - self.tensor_names = None - self.metadata_override = metadata_override - self.model_name = model_name - self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py - - # Apply heuristics to figure out typical tensor encoding based on first layer tensor encoding type - if self.ftype == gguf.LlamaFileType.GUESSED: - _, first_tensor = next(self.get_tensors()) - if first_tensor.dtype == torch.float16: - logger.info( - f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})" - ) - self.ftype = gguf.LlamaFileType.MOSTLY_F16 - else: - logger.info( - f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})" - ) - self.ftype = gguf.LlamaFileType.MOSTLY_BF16 - - # Configure GGUF Writer - self.gguf_writer = gguf.GGUFWriter( - path=None, - arch=MODEL_ARCH_NAMES[self.model_arch], - endianess=self.endianess, - use_temp_file=self.use_temp_file, - split_max_tensors=split_max_tensors, - split_max_size=split_max_size, - dry_run=dry_run, - small_first_shard=small_first_shard, - ) - - @classmethod - def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: - stem, suffix = path.stem, path.suffix - new_name = f"{prefix}{stem}{suffix}" - return path.with_name(new_name) - - def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: - key = next((k for k in keys if k in self.hparams), None) - if key is not None: - return self.hparams[key] - if optional: - return None - raise KeyError(f"could 
not find any of: {keys}") - - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - tensor_names_from_parts: set[str] = set() - - self.tensor_names = tensor_names_from_parts - weight_map: dict[str, str] = {} - - logger.info("gguf: loading 'consolidated.satensors'") - ctx: ContextManager[Any] - from safetensors import safe_open - - ctx = cast( - ContextManager[Any], - safe_open( - self.dir_model / "consolidated.safetensors", - framework="pt", - device="cpu", - ), - ) - - with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) - - for name in model_part.keys(): - if self.lazy: - data = model_part.get_slice(name) - data = LazyTorchTensor.from_safetensors_slice(data) - else: - data = model_part.get_tensor(name) - yield name, data - - # verify tensor name presence and identify potentially missing files - if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) - extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) - missing_files = sorted( - set(weight_map[n] for n in missing if n in weight_map) - ) - if len(extra) == 0 and len(missing_files) > 0: - raise ValueError( - f"Missing or incomplete model files: {missing_files}\n" - f"Missing tensors: {missing}" - ) - else: - raise ValueError( - "Mismatch between weight map and model parts for tensor names:\n" - f"Missing tensors: {missing}\n" - f"Extra tensors: {extra}" - ) - - def format_tensor_name( - self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight" - ) -> str: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - raise ValueError( - f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}" - ) - name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in name: - assert bid is not None - name = name.format(bid=bid) - return name + suffix - - def match_model_tensor_name( - self, - name: str, - key: gguf.MODEL_TENSOR, - bid: int | None, - suffix: str = ".weight", - ) -> bool: - if key not in gguf.MODEL_TENSORS[self.model_arch]: - return False - key_name: str = gguf.TENSOR_NAMES[key] - if "{bid}" in key_name: - if bid is None: - return False - key_name = key_name.format(bid=bid) - else: - if bid is not None: - return False - return name == (key_name + suffix) - - def map_tensor_name( - self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias") - ) -> str: - new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) - if new_name is None: - raise ValueError(f"Can not map tensor {name!r}") - return new_name - - def set_gguf_parameters(self): - raise NotImplementedError( - "set_gguf_parameters() must be implemented in subclasses" - ) - - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len( - ".weight," - ) - - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith( - (".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq") - ): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # use the first number-like part of the tensor name as the block id - bid = None - for part in name.split("."): - if part.isdecimal(): - bid = int(part) - break - - for 
new_name, data_torch in self.modify_tensors(data_torch, name, bid): - # hard coded for pixtral - if name == "vision_language_adapter.w_in.weight": - assert new_name == "mm.23.weight", new_name - new_name = "mm.1.weight" - elif name == "vision_language_adapter.w_out.weight": - assert new_name == "mm.23.weight", new_name - new_name = "mm.2.weight" - - data = data_torch.numpy() - - # if data ends up empty, it means data_torch was a scalar tensor -> restore - if len(data.shape) == 0: - data = data_torch.numpy() - - n_dims = len(data.shape) - data_qtype: gguf.GGMLQuantizationType | bool = False - - # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors - if n_dims <= 1 or new_name.endswith("_norm.weight"): - data_qtype = gguf.GGMLQuantizationType.F32 - - # Conditions should closely match those in llama_model_quantize_internal in llama.cpp - # Some tensor types are always in float32 - if data_qtype is False and ( - any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.FFN_GATE_INP, - gguf.MODEL_TENSOR.POS_EMBD, - gguf.MODEL_TENSOR.TOKEN_TYPES, - gguf.MODEL_TENSOR.V_ENC_EMBD_POS, - ) - ) - or not new_name.endswith(".weight") - ): - data_qtype = gguf.GGMLQuantizationType.F32 - - if data_qtype is False and any( - self.match_model_tensor_name(new_name, key, bid) - for key in ( - gguf.MODEL_TENSOR.TOKEN_EMBD, - gguf.MODEL_TENSOR.OUTPUT, - ) - ): - if self.ftype in ( - gguf.LlamaFileType.MOSTLY_TQ1_0, - gguf.LlamaFileType.MOSTLY_TQ2_0, - ): - # TODO: use Q4_K and Q6_K - data_qtype = gguf.GGMLQuantizationType.F16 - - # No override (data_qtype is False), or wants to be quantized (data_qtype is True) - if isinstance(data_qtype, bool): - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ1_0: - data_qtype = gguf.GGMLQuantizationType.TQ1_0 - elif self.ftype == gguf.LlamaFileType.MOSTLY_TQ2_0: - data_qtype = gguf.GGMLQuantizationType.TQ2_0 - else: - raise ValueError(f"Unknown file type: {self.ftype.name}") - - try: - data = gguf.quants.quantize(data, data_qtype) - except gguf.QuantError as e: - logger.warning("%s, %s", e, "falling back to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - shape = ( - gguf.quant_shape_from_byte_shape(data.shape, data_qtype) - if data.dtype == np.uint8 - else data.shape - ) - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}" - - # n_dims is implicit in the shape - logger.info( - f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}" - ) - - self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype) - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MODEL) - - def prepare_metadata(self): - total_params, shared_params, expert_params, expert_count = ( - self.gguf_writer.get_total_parameter_count() - ) - - self.metadata = gguf.Metadata.load( - self.metadata_override, self.dir_model_card, self.model_name, total_params - ) - - # If we are using HF model id, set the metadata name to the model id - if self.remote_hf_model_id: - 
self.metadata.name = self.remote_hf_model_id - - # Fallback to model directory name if metadata name is still missing - if self.metadata.name is None: - self.metadata.name = self.dir_model.name - - # Generate parameter weight class (useful for leader boards) if not yet determined - if self.metadata.size_label is None and total_params > 0: - self.metadata.size_label = gguf.size_label( - total_params, shared_params, expert_params, expert_count - ) - - self.set_type() - - logger.info("Set meta model") - self.metadata.set_gguf_meta_model(self.gguf_writer) - - logger.info("Set model parameters") - self.set_gguf_parameters() - - logger.info("Set model quantization version") - self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) - - def write(self): - self.prepare_tensors() - self.prepare_metadata() - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "params.json", "r", encoding="utf-8") as f: - config = json.load(f) - return config - - -class TextModel(ModelBase): - model_type = ModelType.TEXT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - if "text_config" in self.hparams: - # move the text_config to the root level - self.hparams = {**self.hparams, **self.hparams["text_config"]} - - self.block_count = self.find_hparam( - ["n_layers", "num_hidden_layers", "n_layer", "num_layers"] - ) - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def set_vocab(self): - logger.info( - f"Converting tokenizer {self.vocab.tokenizer_type} of size {self.vocab.vocab_size}." - ) - - self.gguf_writer.add_tokenizer_model(self.vocab.gguf_tokenizer_model) - - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in self.vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == self.vocab.vocab_size, ( - f"token count ({len(tokens)}) != vocab size ({self.vocab.vocab_size})" - ) - - if self.vocab.tokenizer_type == MistralTokenizerType.tekken: - self.gguf_writer.add_tokenizer_pre("tekken") - self.gguf_writer.add_token_merges( - self.vocab.extract_vocab_merges_from_model() - ) - - logger.info( - f"Setting bos, eos, unk and pad token IDs to {self.vocab.bos_id}, {self.vocab.eos_id}, {self.vocab.unk_id}, {self.vocab.pad_id}." - ) - - self.gguf_writer.add_bos_token_id(self.vocab.bos_id) - self.gguf_writer.add_eos_token_id(self.vocab.eos_id) - self.gguf_writer.add_unk_token_id(self.vocab.unk_id) - self.gguf_writer.add_pad_token_id(self.vocab.pad_id) - - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_vocab_size(self.vocab.vocab_size) - - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(False) - - def set_vocab_none(self): - logger.info("Skipping tokenizer conversion.") - logger.info("Setting tokenizer to 'none'.") - self.gguf_writer.add_tokenizer_model("none") - - logger.info( - f"Setting bos, eos, unk and pad token IDs to {self.vocab.bos_id}, {self.vocab.eos_id}, {self.vocab.unk_id}, {self.vocab.pad_id}." 
- ) - self.gguf_writer.add_bos_token_id(self.vocab.bos_id) - self.gguf_writer.add_eos_token_id(self.vocab.eos_id) - self.gguf_writer.add_unk_token_id(self.vocab.unk_id) - self.gguf_writer.add_pad_token_id(self.vocab.pad_id) - - logger.info(f"Setting vocab size to {self.vocab.vocab_size}.") - self.gguf_writer.add_vocab_size(self.vocab.vocab_size) - - self.gguf_writer.add_add_bos_token(False) - self.gguf_writer.add_add_eos_token(False) - - def prepare_metadata(self): - super().prepare_metadata() - - total_params = self.gguf_writer.get_total_parameter_count()[0] - # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - fname_default: str = gguf.naming_convention( - self.metadata.name, - self.metadata.basename, - self.metadata.finetune, - self.metadata.version, - self.metadata.size_label, - output_type, - model_type="LoRA" if total_params < 0 else None, - ) - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename( - self.fname_out.name, output_type - ) - - logger.info("Set model tokenizer") - self.set_vocab() - - def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if self.ctx == 0: - raise ValueError("ctx not passed as argument") - self.gguf_writer.add_context_length(self.ctx) - logger.info(f"gguf: training context length = {self.ctx}") - - if (n_embd := self.find_hparam(["dim"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["hidden_dim"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["n_heads"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("n_kv_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - - if (f_norm_eps := self.find_hparam(["norm_eps"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") - - -class MmprojModel(ModelBase): - model_type = ModelType.MMPROJ - model_arch = gguf.MODEL_ARCH.MMPROJ - preprocessor_config: dict[str, Any] - global_config: dict[str, Any] - - n_block_keys = ["num_hidden_layers"] - - has_vision_encoder: bool = True - - hparams_vision: dict[str, Any] - - def __init__(self, *args, **kwargs): - 
super().__init__(*args, **kwargs) - - text_config = { - k: v for k, v in self.hparams.items() if k not in ["vision_encoder"] - } - self.n_embd_text = text_config.get("hidden_dim", 0) - assert self.n_embd_text > 0, "n_embd not found in hparams" - - # move vision config to the top level, while preserving the original hparams in global_config - import copy - - self.global_config = copy.deepcopy(self.hparams) - self.hparams_vision = self.get_vision_config() - - self.block_count = self.hparams_vision.get("num_hidden_layers", 0) - assert self.block_count > 0, "num_hidden_layers not found in vision_config" - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - - def get_vision_config(self) -> dict[str, Any]: - vision_config = self.global_config.get("vision_encoder") - assert vision_config is not None, "vision_config not found in hparams" - return vision_config - - def set_type(self): - self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) - - def set_gguf_parameters(self): - self.gguf_writer.add_file_type(self.ftype) - - if not self.has_vision_encoder: - raise ValueError("MmprojModel must have a vision encoder") - - def find_vparam(self, keys: Iterable[str], optional: bool = False) -> Any: - assert self.hparams_vision is not None - return self._find_param(self.hparams_vision, keys, optional) - - def _find_param( - self, obj: dict[str, Any], keys: Iterable[str], optional: bool = False - ) -> Any: - key = next((k for k in keys if k in obj), None) - if key is not None: - return obj[key] - if optional: - return None - raise KeyError(f"could not find any of: {keys}") - - -class MistralModel(TextModel): - model_name = "mistral" - model_arch = MODEL_ARCH.LLAMA - undo_permute = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def set_gguf_parameters(self): - super().set_gguf_parameters() - hparams = self.hparams - - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] - else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - - rope_scaling = self.hparams.get("rope_scaling") or {} - if ( - rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" - and "factor" in rope_scaling - ): - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) - - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["n_heads"] - n_kv_head = self.hparams.get("n_kv_heads") - is_vision_tensor = any( - name.startswith(prefix) - for prefix in [ - "vision_encoder.", - "vision_language_adapter.", - "patch_merger.", - "pre_mm_projector_norm", - ] - ) - - if is_vision_tensor: - return [] # skip vision tensors - - if self.undo_permute: - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = self.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = self.permute(data_torch, n_head, n_kv_head) - - return [(self.map_tensor_name(name), data_torch)] - - -class PixtralModel(MmprojModel): - model_name = "mistral" - img_break_tok_id = -1 - - def __init__(self, *args, 
**kwargs): - super().__init__(*args, **kwargs) - # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py - self.hparams["layer_norm_eps"] = self.hparams.get("norm_eps", 1e-5) - self.img_break_tok_id = self.hparams_vision.get("image_break_token_id", -1) - assert self.img_break_tok_id >= 0, ( - "image_break_token_id not found in vision_config" - ) - logger.info(f"Image break token id: {self.img_break_tok_id}") - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) - - self.gguf_writer.add_clip_has_vision_encoder(True) - self.gguf_writer.add_vision_projection_dim(self.n_embd_text) - - # vision config - self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"])) - self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) - self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"])) - self.gguf_writer.add_vision_feed_forward_length( - self.find_vparam(["intermediate_size"]) - ) - self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) - self.gguf_writer.add_vision_head_count( - self.find_vparam(["num_attention_heads"]) - ) - - # preprocessor config - self.gguf_writer.add_vision_image_mean( - self.hparams_vision.get("image_mean", DATASET_MEAN) - ) - self.gguf_writer.add_vision_image_std( - self.hparams_vision.get("image_std", DATASET_STD) - ) - - self.gguf_writer.add_vision_attention_layernorm_eps( - self.find_hparam(["layer_norm_eps"]) - ) - self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) - - self.gguf_writer.add_vision_use_silu(True) - - # spatial_merge_size - if self.hparams_vision["mm_projector_id"] == "patch_merge": - self.gguf_writer.add_vision_spatial_merge_size( - self.find_vparam(["spatial_merge_size"]) - ) - - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.hparams_vision["num_attention_heads"] - n_kv_head = n_head - - if any( - name.startswith(prefix) - for prefix in [ - "vision_encoder.", - "vision_language_adapter.", - "patch_merger.", - "pre_mm_projector_norm", - ] - ): - # process vision tensors - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = MistralModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = MistralModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] - - if self.img_break_tok_id > 0 and "tok_embeddings.weight" in name: - logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") - # for pixtral model, we need to extract the [IMG_BREAK] token embedding - img_break_embd = data_torch[self.img_break_tok_id] - name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - return [(self.map_tensor_name(name), img_break_embd)] - - return [] # skip other tensors - - -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - } - - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 - 
_dtype_str_map: dict[str, torch.dtype] = { - "F64": torch.float64, - "F32": torch.float32, - "BF16": torch.bfloat16, - "F16": torch.float16, - # "U64": torch.uint64, - "I64": torch.int64, - # "U32": torch.uint32, - "I32": torch.int32, - # "U16": torch.uint16, - "I16": torch.int16, - "U8": torch.uint8, - "I8": torch.int8, - "BOOL": torch.bool, - "F8_E4M3": torch.float8_e4m3fn, - "F8_E5M2": torch.float8_e5m2, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - args=(self,), - func=(lambda s: s.numpy()), - ) - - @classmethod - def meta_with_dtype_and_shape( - cls, dtype: torch.dtype, shape: tuple[int, ...] - ) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") - - @classmethod - def from_safetensors_slice(cls, st_slice: Any) -> Tensor: - dtype = cls._dtype_str_map[st_slice.get_dtype()] - shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls( - meta=cls.meta_with_dtype_and_shape(dtype, shape), - args=(st_slice,), - func=lambda s: s[:], - ) - return cast(torch.Tensor, lazy) - - @classmethod - def from_remote_tensor(cls, remote_tensor: gguf.utility.RemoteTensor): - dtype = cls._dtype_str_map[remote_tensor.dtype] - shape = remote_tensor.shape - meta = cls.meta_with_dtype_and_shape(dtype, shape) - lazy = cls( - meta=meta, - args=(remote_tensor,), - func=lambda r: torch.frombuffer(r.data(), dtype=dtype).reshape(shape), - ) - return cast(torch.Tensor, lazy) - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - - if kwargs is None: - kwargs = {} - - if func is torch.Tensor.numpy: - return args[0].numpy() - - return cls._wrap_fn(func)(*args, **kwargs) - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert a huggingface model to a GGML compatible file" - ) - parser.add_argument( - "--outfile", - type=Path, - help="path to write to; default: based on input. 
{ftype} will be replaced by the outtype.", - ) - parser.add_argument( - "--outtype", - type=str, - choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], - default="bf16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", - ) - parser.add_argument( - "--bigendian", - action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", - type=Path, - help="directory containing model file", - nargs="?", - ) - parser.add_argument( - "--ctx-train", - type=int, - help="Training context size", - required=False, - ) - parser.add_argument( - "--use-temp-file", - action="store_true", - help="use the tempfile library while processing (helpful when running out of memory, process killed)", - ) - parser.add_argument( - "--no-lazy", - action="store_true", - help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", - ) - parser.add_argument( - "--model-name", - type=str, - default=None, - help="name of the model", - ) - parser.add_argument( - "--verbose", - action="store_true", - help="increase output verbosity", - ) - parser.add_argument( - "--split-max-tensors", - type=int, - default=0, - help="max tensors in each split", - ) - parser.add_argument( - "--split-max-size", - type=str, - default="0", - help="max size per split N(M|G)", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="only print out a split plan and exit, without writing any new files", - ) - parser.add_argument( - "--no-tensor-first-split", - action="store_true", - help="do not add tensors to the first split (disabled by default)", - ) - parser.add_argument( - "--metadata", - type=Path, - help="Specify the path for an authorship metadata override file", - ) - parser.add_argument( - "--remote", - action="store_true", - help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'mistralai/Mistral-Small-3.2-24B-Instruct-2506'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", - ) - parser.add_argument( - "--mmproj", - action="store_true", - help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
A prefix 'mmproj-' will be added to the output file name.", - ) - - args = parser.parse_args() - return args - - -def split_str_to_n_bytes(split_str: str) -> int: - if split_str.endswith("K"): - n = int(split_str[:-1]) * 1000 - elif split_str.endswith("M"): - n = int(split_str[:-1]) * 1000 * 1000 - elif split_str.endswith("G"): - n = int(split_str[:-1]) * 1000 * 1000 * 1000 - elif split_str.isnumeric(): - n = int(split_str) - else: - raise ValueError( - f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G" - ) - - if n < 0: - raise ValueError(f"Invalid split size: {split_str}, must be positive") - - return n - - -def main() -> None: - args = parse_args() - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - dir_model = args.model - - if args.remote: - from huggingface_hub import snapshot_download - - local_dir = snapshot_download( - repo_id=str(dir_model), - allow_patterns=[ - "LICENSE", - "params.json", - "tekken.json", - "*.md", - "tokenizer.model", - ], - ) - dir_model = Path(local_dir) - logger.info(f"Downloaded config and tokenizer to {local_dir}") - - if not dir_model.is_dir(): - logger.error(f"Error: {args.model} is not a directory") - sys.exit(1) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, - "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, - "auto": gguf.LlamaFileType.GUESSED, - } - - is_split = args.split_max_tensors > 0 or args.split_max_size != "0" - if args.use_temp_file and is_split: - logger.error("Error: Cannot use temp file when splitting") - sys.exit(1) - - if args.outfile is not None: - fname_out = args.outfile - elif args.remote: - # if remote, use the model ID as the output file name - fname_out = Path("./" + str(args.model).replace("/", "-") + "-{ftype}.gguf") - else: - fname_out = dir_model - - logger.info(f"Loading model: {dir_model.name}") - - with torch.inference_mode(): - output_type = ftype_map[args.outtype] - hparams = ModelBase.load_hparams(dir_model) - model_class: Type[ModelBase] - if args.mmproj and hparams.get("vision_encoder") is not None: - model_class = PixtralModel - elif args.mmproj: - raise ValueError( - "Multimodal projector export is only supported for vision models" - ) - else: - model_class = MistralModel - logger.info(f"Model architecture: {model_class.__name__}") - - model_instance = model_class( - dir_model, - output_type, - fname_out, - is_big_endian=args.bigendian, - use_temp_file=args.use_temp_file, - eager=args.no_lazy, - metadata_override=args.metadata, - model_name=args.model_name, - split_max_tensors=args.split_max_tensors, - split_max_size=split_str_to_n_bytes(args.split_max_size), - dry_run=args.dry_run, - small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=str(args.model) if args.remote else None, - ctx=args.ctx_train, - ) - - logger.info("Exporting model...") - model_instance.write() - out_path = ( - f"{model_instance.fname_out.parent}{os.sep}" - if is_split - else model_instance.fname_out - ) - logger.info(f"Model successfully exported to {out_path}") - - -if __name__ == "__main__": - main() diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 8354bd922c1b7..3899a83e021d4 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -120,7 +120,7 @@ def get_list_tensors_model(cls, model_id: str) -> 
dict[str, RemoteTensor]: """ # case 1: model has only one single model.safetensor file is_single_file = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors") - is_single_file_consolidated = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/consolidated.safetensors", user_agent="convert_mistral_to_gguf") + is_single_file_consolidated = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/consolidated.safetensors") if is_single_file: url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors" return cls.get_list_tensors(url) @@ -242,7 +242,7 @@ def get_data_by_range(cls, url: str, start: int, size: int = -1) -> bytes: return response.content[slice(size if size > -1 else None)] @classmethod - def check_file_exist(cls, url: str, user_agent="convert_hf_to_gguf") -> bool: + def check_file_exist(cls, url: str) -> bool: """ Check if a file exists at the given URL. Returns True if the file exists, False otherwise. @@ -255,7 +255,7 @@ def check_file_exist(cls, url: str, user_agent="convert_hf_to_gguf") -> bool: raise ValueError(f"Invalid URL: {url}") try: - headers = cls._get_request_headers(user_agent=user_agent) + headers = cls._get_request_headers() headers["Range"] = "bytes=0-0" response = requests.head(url, allow_redirects=True, headers=headers) # Success (2xx) or redirect (3xx) @@ -264,9 +264,9 @@ def check_file_exist(cls, url: str, user_agent="convert_hf_to_gguf") -> bool: return False @classmethod - def _get_request_headers(cls, user_agent="convert_hf_to_gguf") -> dict[str, str]: + def _get_request_headers(cls) -> dict[str, str]: """Prepare common headers for requests.""" - headers = {"User-Agent": user_agent} + headers = {"User-Agent": "convert_hf_to_gguf"} if os.environ.get("HF_TOKEN"): headers["Authorization"] = f"Bearer {os.environ['HF_TOKEN']}" return headers diff --git a/pyproject.toml b/pyproject.toml index 69ea98c1dbb8a..3d71b055a8dbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,5 +42,4 @@ build-backend = "poetry.core.masonry.api" llama-convert-hf-to-gguf = "convert_hf_to_gguf:main" llama-convert-lora-to-gguf = "convert_lora_to_gguf:main" llama-convert-llama-ggml-to-gguf = "convert_llama_ggml_to_gguf:main" -llama-convert-mistral-to-gguf = "convert_mistral_to_gguf:main" llama-ggml-vk-generate-shaders = "ggml_vk_generate_shaders:main" diff --git a/requirements.txt b/requirements.txt index 9120254ca1f49..f2a18d62879b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,5 +10,4 @@ -r ./requirements/requirements-convert_hf_to_gguf_update.txt -r ./requirements/requirements-convert_llama_ggml_to_gguf.txt -r ./requirements/requirements-convert_lora_to_gguf.txt --r ./requirements/requirements-convert_mistral_to_gguf.txt -r ./requirements/requirements-tool_bench.txt diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index dc0b83d09b8fc..56b6752ac0645 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -11,7 +11,6 @@ -r ./requirements-convert_hf_to_gguf_update.txt -r ./requirements-convert_legacy_llama.txt -r ./requirements-convert_llama_ggml_to_gguf.txt --r ./requirements-convert_mistral_to_gguf.txt -r ./requirements-tool_bench.txt -r ./requirements-gguf_editor_gui.txt diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index fd21ec479541f..4da85fcf6051c 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,6 
+1,7 @@ mistral-common>=1.8.3 -r ./requirements-convert_legacy_llama.txt +mistral-common>=1.8.3 --extra-index-url https://download.pytorch.org/whl/cpu torch~=2.2.1; platform_machine != "s390x" diff --git a/requirements/requirements-convert_mistral_to_gguf.txt b/requirements/requirements-convert_mistral_to_gguf.txt deleted file mode 100644 index 5616161201eef..0000000000000 --- a/requirements/requirements-convert_mistral_to_gguf.txt +++ /dev/null @@ -1,13 +0,0 @@ -numpy<2.0.0 -gguf>=0.1.0 -protobuf>=4.21.0,<5.0.0 -mistral-common>=1.8.0 -safetensors>=0.5.3 -huggingface_hub>=0.23.2 - ---extra-index-url https://download.pytorch.org/whl/cpu -torch~=2.2.1; platform_machine != "s390x" - -# torch s390x packages can only be found from nightly builds ---extra-index-url https://download.pytorch.org/whl/nightly -torch>=0.0.0.dev0; platform_machine == "s390x" From d1b633c2549d5e81a43e60117eacb15fe4197692 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Mon, 28 Jul 2025 10:22:01 +0200 Subject: [PATCH 05/19] Revert collateral --- convert_hf_to_gguf.py | 8 ++++---- gguf-py/gguf/utility.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0b02f37874efc..da5ddd01e81d6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -120,9 +120,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_model(remote_hf_model_id) + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_model(remote_hf_model_id).items(): + for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) self.get_tensors = get_remote_tensors @@ -4282,7 +4282,7 @@ class BertModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - vocab_size = None + self.vocab_size = None if cls_out_labels := self.hparams.get("id2label"): if len(cls_out_labels) == 2 and cls_out_labels[0] == "LABEL_0": @@ -4300,7 +4300,7 @@ def set_gguf_parameters(self): def set_vocab(self): tokens, toktypes, tokpre = self.get_vocab_base() - vocab_size = len(tokens) + self.vocab_size = len(tokens) # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 3899a83e021d4..7f275a5731ef2 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -111,7 +111,7 @@ class SafetensorRemote: ALIGNMENT = 8 # bytes @classmethod - def get_list_tensors_model(cls, model_id: str) -> dict[str, RemoteTensor]: + def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: """ Get list of tensors from a Hugging Face model repository. 
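For reference, a minimal sketch of how the get_list_tensors_hf_model entry point restored above can be exercised on its own. It assumes the gguf-py package from this repository is importable (the converter scripts prepend it to sys.path) and that HF_TOKEN is exported for gated repositories; the repo id is the example quoted in the --remote help text and is purely illustrative.

    # Sketch: list a few tensors of a safetensors repo without downloading the weights.
    from gguf.utility import SafetensorRemote

    # Returns dict[str, RemoteTensor]; each RemoteTensor carries dtype, shape and a data() accessor.
    remote_tensors = SafetensorRemote.get_list_tensors_hf_model(
        "mistralai/Mistral-Small-3.2-24B-Instruct-2506"
    )
    for name, tensor in sorted(remote_tensors.items())[:5]:
        print(name, tensor.dtype, tensor.shape)
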
From 02e932f1131d540c91fee187abcf2143e2ebc3e6 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Mon, 28 Jul 2025 10:23:47 +0200 Subject: [PATCH 06/19] Rename model name --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index da5ddd01e81d6..5a16d16970ab9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7840,7 +7840,7 @@ def prepare_tensors(self): class MistralModel(TextModel): - model_name = "mistral" + model_name = "Mistral" model_arch = MODEL_ARCH.LLAMA undo_permute = True @@ -7905,7 +7905,7 @@ def modify_tensors( class PixtralModel(MmprojModel): - model_name = "mistral" + model_name = "Pixtral" img_break_tok_id = -1 def __init__(self, *args, **kwargs): From b374360efd0fab54180f74f69196e98159ed2b93 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Mon, 28 Jul 2025 20:43:09 +0200 Subject: [PATCH 07/19] refactor --- convert_hf_to_gguf.py | 275 ++++++++++++++++----------------- gguf-py/gguf/tensor_mapping.py | 2 - 2 files changed, 137 insertions(+), 140 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5a16d16970ab9..5fc7fb2ec69a3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -89,12 +89,14 @@ class ModelBase: block_count: int tensor_map: gguf.TensorNameMap + is_mistral_format: bool = False + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, - remote_hf_model_id: str | None = None, n_ctx: int = 0, is_mistral_format: bool = False): + remote_hf_model_id: str | None = None): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -109,11 +111,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.use_temp_file = use_temp_file self.lazy = not eager or (remote_hf_model_id is not None) self.remote_hf_model_id = remote_hf_model_id - self.n_ctx = n_ctx - self.is_mistral_format = is_mistral_format - - if is_mistral_format and not n_ctx: - raise ValueError("Please pass the context length using --ctx when using mistral formats.") if remote_hf_model_id is not None: self.is_safetensors = True @@ -127,12 +124,12 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.get_tensors = get_remote_tensors else: - prefix = "model" if not is_mistral_format else "consolidated" + prefix = "model" if not self.is_mistral_format else "consolidated" self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format) if hparams is None else hparams + self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name @@ -296,14 +293,6 @@ def prepare_tensors(self): break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - # hard coded for pixtral - if name == "vision_language_adapter.w_in.weight": - assert new_name == "mm.23.weight", new_name - 
new_name = "mm.1.weight" - elif name == "vision_language_adapter.w_out.weight": - assert new_name == "mm.23.weight", new_name - new_name = "mm.2.weight" - # TODO: why do we squeeze here? # data = data_torch.squeeze().numpy() data = data_torch.numpy() @@ -566,12 +555,7 @@ def prepare_metadata(self, vocab_only: bool): def set_gguf_parameters(self): self.gguf_writer.add_block_count(self.block_count) - if self.is_mistral_format: - n_ctx = self.n_ctx - else: - n_ctx = self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True) - - if n_ctx is not None: + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx", "n_positions", "max_length"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") @@ -2014,10 +1998,9 @@ def __init__(self, *args, **kwargs): self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) def set_vocab(self): - path_tekken_json = self.dir_model / "tekken.json" - path_tokenizer_json = self.dir_model / "tokenizer.json" - if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): - return self.set_vocab_tekken() + if self.is_mistral_format: + self._set_vocab_mistral() + return try: self._set_vocab_sentencepiece() @@ -2100,7 +2083,9 @@ def set_vocab_tekken(self): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if not self.is_mistral_format: + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] @@ -2122,13 +2107,25 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) + n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) + + vision_prefixes = [ + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ] + is_multimodal_tensor = "vision_tower" in name \ or "vision_model" in name \ or "audio_tower" in name \ or "model.connector" in name \ - or "multi_modal_projector" in name + or "multi_modal_projector" in name \ + or any( + name.startswith(prefix) + for prefix in vision_prefixes + ) if is_multimodal_tensor: return [] # skip vision tensors @@ -2244,13 +2241,16 @@ class LlavaVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if self.hparams["model_type"] == "pixtral": + if self.hparams.get("model_type") == "pixtral": # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") - logger.info(f"Image break token id: {self.img_break_tok_id}") + elif self.is_mistral_format: + self.hparams["layer_norm_eps"] = self.hparams.get("norm_eps", 1e-5) + self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) else: raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + logger.info(f"Image break token id: {self.img_break_tok_id}") def get_token_id(self, token: str) -> int: tokenizer_config_file = self.dir_model / 
'tokenizer_config.json' @@ -2264,7 +2264,7 @@ def get_token_id(self, token: str) -> int: def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - if hparams["model_type"] == "pixtral": + if hparams.get("model_type") == "pixtral": self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) @@ -2282,18 +2282,30 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - n_head = self.hparams["num_attention_heads"] + n_head = ( + self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) + ) n_kv_head = n_head - if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."): + valid_prefixes = ( + "multi_modal_projector.", + "vision_tower.", + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ) + + if any(name.startswith(prefix) for prefix in valid_prefixes): # process vision tensors - if name.endswith(("q_proj.weight", "q_proj.bias")): + if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): + if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] - if self.img_break_tok_id > 0 and "embed_tokens.weight" in name: + embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" + if self.img_break_tok_id > 0 and embed_key in name: logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") # for pixtral model, we need to extract the [IMG_BREAK] token embedding img_break_embd = data_torch[self.img_break_tok_id] @@ -7839,81 +7851,101 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -class MistralModel(TextModel): - model_name = "Mistral" - model_arch = MODEL_ARCH.LLAMA - undo_permute = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) +@ModelBase.register("SmallThinkerForCausalLM") +class SmallThinkerModel(TextModel): + model_arch = gguf.MODEL_ARCH.SMALLTHINKER def set_gguf_parameters(self): super().set_gguf_parameters() - hparams = self.hparams - - if "head_dim" in hparams: - rope_dim = hparams["head_dim"] + if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + self.gguf_writer.add_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + if (self.hparams.get('moe_primary_router_apply_softmax')): + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) else: - rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(rope_dim) - + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + # 
YaRN is not enabled by default + # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts rope_scaling = self.hparams.get("rope_scaling") or {} - if ( - rope_scaling.get("rope_type", rope_scaling.get("type")) == "linear" - and "factor" in rope_scaling - ): - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - @staticmethod - def permute(weights: Tensor, n_head: int, n_head_kv: int | None): - if n_head_kv is not None and n_head != n_head_kv: - n_head = n_head_kv - return ( - weights.reshape( - n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:] - ) - .swapaxes(1, 2) - .reshape(weights.shape) - ) + sliding_window_layout = self.hparams.get("sliding_window_layout") + if sliding_window_layout: + for i in sliding_window_layout: + if i != 0: + sliding_window = self.hparams.get("sliding_window_size") + if sliding_window: + self.gguf_writer.add_sliding_window(sliding_window) + break - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["n_heads"] - n_kv_head = self.hparams.get("n_kv_heads") - is_vision_tensor = any( - name.startswith(prefix) - for prefix in [ - "vision_encoder.", - "vision_language_adapter.", - "patch_merger.", - "pre_mm_projector_norm", - ] - ) + _experts: list[dict[str, Tensor]] | None = None - if is_vision_tensor: - return [] # skip vision tensors + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts")) + assert bid is not None - if self.undo_permute: - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = self.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = self.permute(data_torch, n_head, n_kv_head) + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down", "gate", "up"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] return [(self.map_tensor_name(name), data_torch)] + def prepare_tensors(self): + super().prepare_tensors() -class PixtralModel(MmprojModel): - model_name = "Pixtral" - img_break_tok_id = -1 + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") - def __init__(self, *args, **kwargs): - 
super().__init__(*args, **kwargs) - # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py - self.hparams["layer_norm_eps"] = self.hparams.get("norm_eps", 1e-5) - self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) - logger.info(f"Image break token id: {self.img_break_tok_id}") + +class MistralModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + model_name = "Mistral" + hf_arch = "" + is_mistral_format = True + undo_permute = True + + +class PixtralModel(LlavaVisionModel): + model_name = "Pixtral" + hf_arch = "" + is_mistral_format = True def set_gguf_parameters(self): super().set_gguf_parameters() @@ -7931,38 +7963,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_spatial_merge_size( self.find_vparam(["spatial_merge_size"]) ) - - def modify_tensors( - self, data_torch: Tensor, name: str, bid: int | None - ) -> Iterable[tuple[str, Tensor]]: - del bid # unused - n_head = self.find_vparam(["num_attention_heads"]) - n_kv_head = n_head - - if any( - name.startswith(prefix) - for prefix in [ - "vision_encoder.", - "vision_language_adapter.", - "patch_merger.", - "pre_mm_projector_norm", - ] - ): - # process vision tensors - if name.endswith(("q_proj.weight", "q_proj.bias")): - data_torch = MistralModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): - data_torch = MistralModel.permute(data_torch, n_head, n_kv_head) - return [(self.map_tensor_name(name), data_torch)] - - if self.img_break_tok_id > 0 and "tok_embeddings.weight" in name: - logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") - # for pixtral model, we need to extract the [IMG_BREAK] token embedding - img_break_embd = data_torch[self.img_break_tok_id] - name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] - return [(self.map_tensor_name(name), img_break_embd)] - - return [] # skip other tensors + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + if name == "vision_language_adapter.w_in.weight": + return "mm.1.weight" + elif name == "vision_language_adapter.w_out.weight": + return "mm.2.weight" + return super().map_tensor_name(name, try_suffixes) ###### CONVERSION LOGIC ###### @@ -8117,12 +8124,6 @@ def parse_args() -> argparse.Namespace: "--mistral-format", action="store_true", help="Whether the model is stored following the Mistral format.", ) - parser.add_argument( - "--n-ctx", - type=int, - help="Training context size", - default=0 - ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -8255,8 +8256,6 @@ def main() -> None: split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, - n_ctx=args.n_ctx, - is_mistral_format=is_mistral_format ) if args.vocab_only: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c2ff3ce3a8cd1..0299a857b1240 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1066,8 +1066,6 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ: ( "multi_modal_projector.linear_{bid}", "visual.merger.mlp.{bid}", # qwen2vl - "vision_language_adapter.w_in", # pixtral - "vision_language_adapter.w_out", # pixtral ), MODEL_TENSOR.V_MMPROJ_FC: ( From 9b5d9a8aeac49cf8065311457d6e60b8ce2fafc7 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Tue, 29 Jul 2025 14:52:45 +0200 Subject: [PATCH 08/19] revert --- convert_hf_to_gguf.py | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5fc7fb2ec69a3..7d7b62476bcd4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2002,6 +2002,11 @@ def set_vocab(self): self._set_vocab_mistral() return + path_tekken_json = self.dir_model / "tekken.json" + path_tokenizer_json = self.dir_model / "tokenizer.json" + if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): + return self.set_vocab_tekken() + try: self._set_vocab_sentencepiece() except FileNotFoundError: From 3f490176515158c8202998c979fe5483f33b6b81 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Tue, 29 Jul 2025 14:54:33 +0200 Subject: [PATCH 09/19] remove duplicate --- convert_hf_to_gguf.py | 83 ------------------------------------------- 1 file changed, 83 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7d7b62476bcd4..0febfb3c09872 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7856,89 +7856,6 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("SmallThinkerForCausalLM") -class SmallThinkerModel(TextModel): - model_arch = gguf.MODEL_ARCH.SMALLTHINKER - - def set_gguf_parameters(self): - super().set_gguf_parameters() - if (n_experts := self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts"))) is not None: - self.gguf_writer.add_expert_count(n_experts) - if (n_experts_used := self.hparams.get("num_experts_per_tok", self.hparams.get("moe_num_active_primary_experts"))) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - if (moe_intermediate_size := self.hparams.get("moe_ffn_hidden_size")) is not None: - self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) - self.gguf_writer.add_feed_forward_length(moe_intermediate_size) - logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - if (self.hparams.get('moe_primary_router_apply_softmax')): - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) - else: - self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts - rope_scaling = self.hparams.get("rope_scaling") or {} - if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: - self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) - self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) - self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) - - sliding_window_layout = self.hparams.get("sliding_window_layout") - if sliding_window_layout: - for i in sliding_window_layout: - if i != 0: - sliding_window = self.hparams.get("sliding_window_size") - if sliding_window: - self.gguf_writer.add_sliding_window(sliding_window) - break - - _experts: list[dict[str, Tensor]] | None = None - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # process the experts separately - if name.find("experts") != -1: - n_experts = self.hparams.get("num_experts", self.hparams.get("moe_num_primary_experts")) - assert bid is not None - - if self._experts is None: - self._experts = [{} for _ in range(self.block_count)] - - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] - - # merge the experts into a single 3d 
tensor - for w_name in ["down", "gate", "up"]: - datas: list[Tensor] = [] - - for xid in range(n_experts): - ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{w_name}.weight" - datas.append(self._experts[bid][ename]) - del self._experts[bid][ename] - - data_torch = torch.stack(datas, dim=0) - - merged_name = f"model.layers.{bid}.block_sparse_moe.experts.{w_name}.weight" - - new_name = self.map_tensor_name(merged_name) - - tensors.append((new_name, data_torch)) - return tensors - else: - return [] - - return [(self.map_tensor_name(name), data_torch)] - - def prepare_tensors(self): - super().prepare_tensors() - - if self._experts is not None: - # flatten `list[dict[str, Tensor]]` into `list[str]` - experts = [k for d in self._experts for k in d.keys()] - if len(experts) > 0: - raise ValueError(f"Unprocessed experts: {experts}") - - class MistralModel(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA model_name = "Mistral" From 0ac6b75de66e273ad43d55ab095afe6204f0c4d1 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Tue, 29 Jul 2025 14:57:58 +0200 Subject: [PATCH 10/19] Remove duplication code --- convert_hf_to_gguf.py | 39 +-------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0febfb3c09872..d992827e48638 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2040,44 +2040,7 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(False) def set_vocab_tekken(self): - vocab = gguf.vocab.MistralVocab(self.dir_model) - self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) - - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size, ( - f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" - ) - - if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken: - self.gguf_writer.add_tokenizer_pre("tekken") - self.gguf_writer.add_token_merges( - vocab.extract_vocab_merges_from_model() - ) - - logger.info( - f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." 
- ) - - self.gguf_writer.add_bos_token_id(vocab.bos_id) - self.gguf_writer.add_eos_token_id(vocab.eos_id) - self.gguf_writer.add_unk_token_id(vocab.unk_id) - self.gguf_writer.add_pad_token_id(vocab.pad_id) - - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_vocab_size(vocab.vocab_size) - - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(False) + self._set_vocab_mistral() script_dir = Path(__file__).parent template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja" From 025dd6e6bf5eabc4af02372ba86da7aab0f7eb8c Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Tue, 29 Jul 2025 15:06:57 +0200 Subject: [PATCH 11/19] Fixes --- convert_hf_to_gguf.py | 3 +-- convert_lora_to_gguf.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d992827e48638..5d806aa5cd228 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -28,7 +28,6 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf -from gguf.constants import MODEL_ARCH, MODEL_ARCH_NAMES from gguf.vocab import MistralTokenizerType, MistralVocab from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD @@ -4546,7 +4545,7 @@ class NomicBertModel(BertModel): def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): hparams = kwargs.pop("hparams", None) if hparams is None: - hparams = ModelBase.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model, False) self.is_moe = bool(hparams.get("moe_every_n_layers")) self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 00a6733cbd360..a67c0536a4128 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -340,7 +340,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = ModelBase.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model, False) with torch.inference_mode(): try: From ba748700d6a661773772cf1800c139a8925061b9 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Tue, 29 Jul 2025 16:41:07 +0200 Subject: [PATCH 12/19] Fix flake issues --- convert_hf_to_gguf.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5d806aa5cd228..7c26eae3493a2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -22,9 +22,6 @@ import numpy as np import torch -if TYPE_CHECKING: - from torch import Tensor - if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf @@ -101,7 +98,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, type(self) is MmprojModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") - self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out @@ -898,7 +894,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") - + def _set_vocab_mistral(self): vocab = MistralVocab(self.dir_model) logger.info( @@ -1282,7 +1278,7 @@ def __init__(self, *args, **kwargs): k: v for k, v in self.hparams.items() if k 
not in ["vision_encoder", "audio_encoder"] } self.n_embd_text = text_config.get("hidden_dim", 0) - + assert self.n_embd_text > 0, "n_embd not found in hparams" # move vision config to the top level, while preserving the original hparams in global_config @@ -2083,7 +2079,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter "patch_merger.", "pre_mm_projector_norm", ] - + is_multimodal_tensor = "vision_tower" in name \ or "vision_model" in name \ or "audio_tower" in name \ @@ -7847,7 +7843,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_spatial_merge_size( self.find_vparam(["spatial_merge_size"]) ) - + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: if name == "vision_language_adapter.w_in.weight": return "mm.1.weight" @@ -8112,7 +8108,7 @@ def main() -> None: if args.mmproj: if "mmproj" not in fname_out.name: fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") - + is_mistral_format = args.mistral_format with torch.inference_mode(): From 402f87e4701af57e11c5346ec3eacbcf4e743895 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 Jul 2025 10:36:31 +0200 Subject: [PATCH 13/19] Apply comments --- convert_hf_to_gguf.py | 124 +++++++++--------- gguf-py/gguf/utility.py | 4 - .../requirements-convert_hf_to_gguf.txt | 1 - 3 files changed, 64 insertions(+), 65 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7c26eae3493a2..acccc22a4d825 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -110,12 +110,17 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, if remote_hf_model_id is not None: self.is_safetensors = True - def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: - logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) - self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): - yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) + if not self.is_mistral_format: + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) + self.tensor_names = set(name for name in remote_tensors.keys()) + for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): + yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) + else: + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + url = f"{gguf.utility.SafetensorRemote.BASE_DOMAIN}/{remote_hf_model_id}/resolve/main/consolidated.safetensors" + return gguf.utility.SafetensorRemote.get_list_tensors(url) self.get_tensors = get_remote_tensors else: @@ -514,10 +519,7 @@ def __init_subclass__(cls): raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") def set_vocab(self): - if self.is_mistral_format: - self._set_vocab_mistral() - else: - self._set_vocab_gpt2() + self._set_vocab_gpt2() def prepare_metadata(self, vocab_only: bool): super().prepare_metadata(vocab_only=vocab_only) @@ -895,50 +897,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: def _set_vocab_none(self) -> None: self.gguf_writer.add_tokenizer_model("none") - def _set_vocab_mistral(self): - vocab = 
MistralVocab(self.dir_model) - logger.info( - f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." - ) - - self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) - - tokens = [] - scores = [] - toktypes = [] - - for text, score, toktype in vocab.all_tokens(): - tokens.append(text) - scores.append(score) - toktypes.append(toktype) - - assert len(tokens) == vocab.vocab_size, ( - f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" - ) - - if vocab.tokenizer_type == MistralTokenizerType.tekken: - self.gguf_writer.add_tokenizer_pre("tekken") - self.gguf_writer.add_token_merges( - vocab.extract_vocab_merges_from_model() - ) - - logger.info( - f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." - ) - - self.gguf_writer.add_bos_token_id(vocab.bos_id) - self.gguf_writer.add_eos_token_id(vocab.eos_id) - self.gguf_writer.add_unk_token_id(vocab.unk_id) - self.gguf_writer.add_pad_token_id(vocab.pad_id) - - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) - self.gguf_writer.add_token_types(toktypes) - self.gguf_writer.add_vocab_size(vocab.vocab_size) - - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(False) - def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -1992,10 +1950,53 @@ def __init__(self, *args, **kwargs): if self.hf_arch == "VLlama3ForCausalLM": self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + def _set_vocab_mistral(self): + vocab = MistralVocab(self.dir_model) + logger.info( + f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." + ) + + self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) + + tokens = [] + scores = [] + toktypes = [] + + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size, ( + f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" + ) + + if vocab.tokenizer_type == MistralTokenizerType.tekken: + self.gguf_writer.add_tokenizer_pre("tekken") + self.gguf_writer.add_token_merges( + vocab.extract_vocab_merges_from_model() + ) + + logger.info( + f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." + ) + + self.gguf_writer.add_bos_token_id(vocab.bos_id) + self.gguf_writer.add_eos_token_id(vocab.eos_id) + self.gguf_writer.add_unk_token_id(vocab.unk_id) + self.gguf_writer.add_pad_token_id(vocab.pad_id) + + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + self.gguf_writer.add_vocab_size(vocab.vocab_size) + + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(False) + def set_vocab(self): if self.is_mistral_format: - self._set_vocab_mistral() - return + return self._set_vocab_mistral() path_tekken_json = self.dir_model / "tekken.json" path_tokenizer_json = self.dir_model / "tokenizer.json" @@ -2209,7 +2210,9 @@ def __init__(self, *args, **kwargs): self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") elif self.is_mistral_format: - self.hparams["layer_norm_eps"] = self.hparams.get("norm_eps", 1e-5) + # hparams is already vision config here so norm_eps is only defined in global_config. 
+ self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) + assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) else: raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") @@ -7819,7 +7822,7 @@ class MistralModel(LlamaModel): model_name = "Mistral" hf_arch = "" is_mistral_format = True - undo_permute = True + undo_permute = False class PixtralModel(LlavaVisionModel): @@ -7832,7 +7835,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) self.gguf_writer.add_vision_attention_layernorm_eps( - self.find_hparam(["layer_norm_eps"]) + self.find_hparam(["norm_eps"]) ) self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) @@ -8123,7 +8126,8 @@ def main() -> None: except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) - elif args.mmproj and hparams.get("vision_encoder"): + elif args.mmproj: + assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" model_class = PixtralModel else: model_class = MistralModel diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 7f275a5731ef2..769ccb02f0d91 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -120,13 +120,9 @@ def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: """ # case 1: model has only one single model.safetensor file is_single_file = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors") - is_single_file_consolidated = cls.check_file_exist(f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/consolidated.safetensors") if is_single_file: url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors" return cls.get_list_tensors(url) - if is_single_file_consolidated: - url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/consolidated.safetensors" - return cls.get_list_tensors(url) # case 2: model has multiple files index_url = f"{cls.BASE_DOMAIN}/{model_id}/resolve/main/model.safetensors.index.json" diff --git a/requirements/requirements-convert_hf_to_gguf.txt b/requirements/requirements-convert_hf_to_gguf.txt index 4da85fcf6051c..fd21ec479541f 100644 --- a/requirements/requirements-convert_hf_to_gguf.txt +++ b/requirements/requirements-convert_hf_to_gguf.txt @@ -1,7 +1,6 @@ mistral-common>=1.8.3 -r ./requirements-convert_legacy_llama.txt -mistral-common>=1.8.3 --extra-index-url https://download.pytorch.org/whl/cpu torch~=2.2.1; platform_machine != "s390x" From 332648454135b1306965fa6f8529b75dd9a7854c Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 Jul 2025 10:37:18 +0200 Subject: [PATCH 14/19] Apply comments --- convert_hf_to_gguf.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index acccc22a4d825..fb10cff6a77e7 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -22,15 +22,15 @@ import numpy as np import torch +if TYPE_CHECKING: + from torch import Tensor + if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf from gguf.vocab import MistralTokenizerType, MistralVocab from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD -if TYPE_CHECKING: - from torch import Tensor - logger = logging.getLogger("hf-to-gguf") From 3fa963f0d307babe025d9d3baebfe10a37e9b212 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 
Jul 2025 10:41:23 +0200 Subject: [PATCH 15/19] Apply comments --- convert_hf_to_gguf.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fb10cff6a77e7..6028668ba9ac6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -69,7 +69,6 @@ class ModelBase: lazy: bool part_names: list[str] is_safetensors: bool - is_mistral_format: bool hparams: dict[str, Any] tensor_names: set[str] | None gguf_writer: gguf.GGUFWriter @@ -91,8 +90,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, - small_first_shard: bool = False, hparams: dict[str, Any] | None = None, - remote_hf_model_id: str | None = None): + small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): if type(self) is ModelBase or \ type(self) is TextModel or \ type(self) is MmprojModel: @@ -106,7 +104,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.use_temp_file = use_temp_file self.lazy = not eager or (remote_hf_model_id is not None) self.remote_hf_model_id = remote_hf_model_id - if remote_hf_model_id is not None: self.is_safetensors = True @@ -2001,7 +1998,13 @@ def set_vocab(self): path_tekken_json = self.dir_model / "tekken.json" path_tokenizer_json = self.dir_model / "tokenizer.json" if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): - return self.set_vocab_tekken() + self._set_vocab_mistral() + + script_dir = Path(__file__).parent + template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja" + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + self.gguf_writer.add_chat_template(template) try: self._set_vocab_sentencepiece() @@ -2033,16 +2036,7 @@ def set_vocab(self): # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) - - def set_vocab_tekken(self): - self._set_vocab_mistral() - - script_dir = Path(__file__).parent - template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja" - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - self.gguf_writer.add_chat_template(template) + self.gguf_writer.add_add_bos_token(False) def set_gguf_parameters(self): super().set_gguf_parameters() From 63002a0259aea2852e649ee294cba75002dab1cd Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 Jul 2025 11:01:22 +0200 Subject: [PATCH 16/19] Fix remote --- convert_hf_to_gguf.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6028668ba9ac6..bd850b1d5efdc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -107,17 +107,19 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, if remote_hf_model_id is not None: self.is_safetensors = True - if not self.is_mistral_format: - def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: - logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + + if not self.is_mistral_format: remote_tensors = 
gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) - self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): - yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) - else: - def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: + + else: url = f"{gguf.utility.SafetensorRemote.BASE_DOMAIN}/{remote_hf_model_id}/resolve/main/consolidated.safetensors" - return gguf.utility.SafetensorRemote.get_list_tensors(url) + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors(url) + + self.tensor_names = set(name for name in remote_tensors.keys()) + for name, remote_tensor in remote_tensors.items(): + yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) self.get_tensors = get_remote_tensors else: From 42489f53ece038258eb0d27467f136a858e84aca Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 Jul 2025 14:55:02 +0200 Subject: [PATCH 17/19] add default chat template --- convert_hf_to_gguf.py | 59 +++++++++++++++++++++++++++++++++---------- gguf-py/gguf/vocab.py | 2 +- 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bd850b1d5efdc..f21746d51ec61 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -29,7 +29,12 @@ sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf from gguf.vocab import MistralTokenizerType, MistralVocab +from mistral_common.tokens.tokenizers.base import TokenizerVersion from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD +from mistral_common.tokens.tokenizers.tekken import Tekkenizer +from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer, +) logger = logging.getLogger("hf-to-gguf") @@ -110,13 +115,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - if not self.is_mistral_format: - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) - - else: - url = f"{gguf.utility.SafetensorRemote.BASE_DOMAIN}/{remote_hf_model_id}/resolve/main/consolidated.safetensors" - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors(url) - + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) self.tensor_names = set(name for name in remote_tensors.keys()) for name, remote_tensor in remote_tensors.items(): yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) @@ -1993,6 +1992,11 @@ def _set_vocab_mistral(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(False) + template_dir = Path(__file__).parent / "models/templates/" + + template = MistralModel.get_community_chat_template(vocab, template_dir) + self.gguf_writer.add_chat_template(template) + def set_vocab(self): if self.is_mistral_format: return self._set_vocab_mistral() @@ -2002,12 +2006,6 @@ def set_vocab(self): if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): self._set_vocab_mistral() - script_dir = Path(__file__).parent - template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja" - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - self.gguf_writer.add_chat_template(template) - try: self._set_vocab_sentencepiece() except 
FileNotFoundError: @@ -2038,7 +2036,7 @@ def set_vocab(self): # Apply to granite small models only if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_add_bos_token(False) def set_gguf_parameters(self): super().set_gguf_parameters() @@ -7820,6 +7818,39 @@ class MistralModel(LlamaModel): is_mistral_format = True undo_permute = False + @staticmethod + def get_community_chat_template(vocab: MistralVocab, templates_dir: Path): + assert TokenizerVersion is not None, "mistral_common is not installed" + assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( + f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" + ) + + if vocab.tokenizer.version == TokenizerVersion.v1: + return "mistral-v1" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v3" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v3-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v7" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v7-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v11: + template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja" + elif vocab.tokenizer.version == TokenizerVersion.v13: + template_file = "unsloth-mistral-Devstral-Small-2507.jinja" + else: + raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type}") + + template_path = templates_dir / template_file + if not template_path.exists(): + raise FileNotFoundError(f"Template file not found: {template_path}") + + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + + return template + class PixtralModel(LlavaVisionModel): model_name = "Pixtral" diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index e1d5aaf47ac46..797659dee0b58 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -25,8 +25,8 @@ except ImportError: _mistral_common_installed = False MistralTokenizer = None - Tekkenizer = None SentencePieceTokenizer = None + Tekkenizer = None _filter_valid_tokenizer_files = None else: _mistral_common_installed = True From 467ccd25f09912b0a078de0c70e0c1df07846deb Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 Jul 2025 14:56:20 +0200 Subject: [PATCH 18/19] Revert --- gguf-py/gguf/vocab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 797659dee0b58..e1d5aaf47ac46 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -25,8 +25,8 @@ except ImportError: _mistral_common_installed = False MistralTokenizer = None - SentencePieceTokenizer = None Tekkenizer = None + SentencePieceTokenizer = None _filter_valid_tokenizer_files = None else: _mistral_common_installed = True From 9493ced7c494135453ec25ace466a22dd0a7ee27 Mon Sep 17 00:00:00 2001 From: Julien Denize Date: Wed, 30 Jul 2025 15:01:28 +0200 Subject: [PATCH 19/19] nit --- convert_hf_to_gguf.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f21746d51ec61..cde2e79c7203a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -114,7 +114,6 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, def get_remote_tensors() -> 
Iterator[tuple[str, Tensor]]:
                 logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}")
-
                 remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id)
                 self.tensor_names = set(name for name in remote_tensors.keys())
                 for name, remote_tensor in remote_tensors.items():
@@ -7840,7 +7839,7 @@ def get_community_chat_template(vocab: MistralVocab, templates_dir: Path):
         elif vocab.tokenizer.version == TokenizerVersion.v13:
             template_file = "unsloth-mistral-Devstral-Small-2507.jinja"
         else:
-            raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type}")
+            raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}")

         template_path = templates_dir / template_file
         if not template_path.exists():
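
The template-selection logic introduced in [PATCH 17/19] and adjusted in [PATCH 19/19] boils down to a lookup from the tokenizer version (and, for v3/v7, the tokenizer type) to either a built-in llama.cpp template name or a Jinja file under models/templates/. The standalone Python sketch below re-expresses that dispatch as a table, for reference only; it is not part of the patch, and the helper name pick_mistral_chat_template is hypothetical. It assumes mistral_common is installed and gguf-py is importable, as the patched converter already requires.

# Illustrative sketch only, not part of this patch: the same version/type
# dispatch that MistralModel.get_community_chat_template performs, written
# as a table lookup. The helper name `pick_mistral_chat_template` is made up.
from pathlib import Path

from mistral_common.tokens.tokenizers.base import TokenizerVersion
from gguf.vocab import MistralTokenizerType

# (tokenizer version, tokenizer type or None for "any") ->
# built-in template name or .jinja file under models/templates/
_TEMPLATE_TABLE = {
    (TokenizerVersion.v1, None): "mistral-v1",
    (TokenizerVersion.v3, MistralTokenizerType.spm): "mistral-v3",
    (TokenizerVersion.v3, MistralTokenizerType.tekken): "mistral-v3-tekken",
    (TokenizerVersion.v7, MistralTokenizerType.spm): "mistral-v7",
    (TokenizerVersion.v7, MistralTokenizerType.tekken): "mistral-v7-tekken",
    (TokenizerVersion.v11, None): "Mistral-Small-3.2-24B-Instruct-2506.jinja",
    (TokenizerVersion.v13, None): "unsloth-mistral-Devstral-Small-2507.jinja",
}


def pick_mistral_chat_template(version, tok_type, templates_dir: Path) -> str:
    """Return a chat template string suitable for gguf_writer.add_chat_template()."""
    # Exact (version, type) match first, then a type-agnostic fallback.
    entry = _TEMPLATE_TABLE.get((version, tok_type)) or _TEMPLATE_TABLE.get((version, None))
    if entry is None:
        raise ValueError(f"Unknown tokenizer type: {tok_type} and version {version}")
    if entry.endswith(".jinja"):
        template_path = templates_dir / entry
        if not template_path.exists():
            raise FileNotFoundError(f"Template file not found: {template_path}")
        return template_path.read_text(encoding="utf-8")
    # Built-in template names ("mistral-v1", "mistral-v7-tekken", ...) are
    # resolved by llama.cpp's chat-template handling at runtime.
    return entry

With a MistralVocab in hand, the returned string feeds directly into gguf_writer.add_chat_template(), which is what the patched _set_vocab_mistral() does.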