diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index 0b07bb6b3..aaf1cbb6c 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -313,6 +313,7 @@ def _compile(
             "mdp_ts_json": mdp_ts_json,
             "num_speculative_tokens": num_speculative_tokens,
         }
+
        compile_hash = hash_dict_params(compile_hash_params)
        compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
diff --git a/QEfficient/transformers/embeddings/embedding_utils.py b/QEfficient/transformers/embeddings/embedding_utils.py
index dd68e5fb9..f1e099098 100644
--- a/QEfficient/transformers/embeddings/embedding_utils.py
+++ b/QEfficient/transformers/embeddings/embedding_utils.py
@@ -98,6 +98,14 @@ def forward(
         return self.pooling_fn(output[0], attention_mask)
 
 
+def embedding_forward(
+    self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
+):
+    print("Forward swapped with new one")
+    output = self.old_forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
+    return output[0]
+
+
 def validate_user_pooling_function(user_function):
     """
     Validate a user-provided pooling function to ensure it meets the required interface.
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 3e50a2783..3426bb5aa 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -161,14 +161,16 @@ class QEFFAutoModel(QEFFTransformersBase):
     _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
-    def __init__(self, model: nn.Module, pooling=None, **kwargs):
+    def __init__(self, model: nn.Module, **kwargs):
         super().__init__(model, **kwargs)
 
         # Make Embedding specific transforms like appending pooling
-        if pooling:
-            self.model, _ = PoolingTransform.apply(self.model, pooling)
+        if kwargs["pooling"]:
+            self.model, _ = PoolingTransform.apply(self.model, kwargs["pooling"])
+        # else:
+        #     self.model, _ = EmbeddingTransform.apply(self.model)
 
-        self.model.base_model.config.use_cache = True
+        # self.model.base_model.config.use_cache = True
 
         self.hash_params["qeff_auto_class"] = self.__class__.__name__
@@ -396,7 +398,7 @@ def cloud_ai_100_feature_generate(
             outputs = self.qpc_session.run(inputs)
         except Exception:
             outputs = {
-                "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
+                "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[0]).astype(
                     np.float32
                 ),
             }
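For context on the pooling path touched above: `PoolingTransform.apply` (invoked in `QEFFAutoModel.__init__`) wraps the loaded model so that its token embeddings are reduced by a pooling callable. Below is a minimal, self-contained sketch of that wrapper shape, using toy modules and an assumed `mean_pooling` helper rather than the library's actual `PooledModel`:

```python
# Toy sketch of a pooling wrapper (assumed structure, not QEfficient's PooledModel).
import torch
import torch.nn as nn


def mean_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    # Weight each token by its mask entry, then average over the sequence axis.
    mask = attention_mask.unsqueeze(-1).float()
    return (last_hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)


class ToyPooledModel(nn.Module):
    def __init__(self, base_model: nn.Module, pooling_fn):
        super().__init__()
        self.base_model = base_model
        self.pooling_fn = pooling_fn

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        hidden = self.base_model(input_ids)  # (batch, seq_len, hidden)
        return self.pooling_fn(hidden, attention_mask)


base = nn.Embedding(100, 16)              # stands in for a transformer encoder
pooled = ToyPooledModel(base, mean_pooling)
ids = torch.randint(0, 100, (2, 5))
mask = torch.ones(2, 5)
print(pooled(ids, mask).shape)            # torch.Size([2, 16])
```

When no pooling is requested, the wrapper is skipped and the model keeps returning per-token embeddings, which is the branch the commented-out `EmbeddingTransform` call above is aimed at.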
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index ca74c0ddd..909f54dcc 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -154,7 +154,12 @@
 from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
-from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
+from QEfficient.transformers.embeddings.embedding_utils import (
+    POOLING_MAP,
+    PooledModel,
+    embedding_forward,
+    validate_user_pooling_function,
+)
 from QEfficient.transformers.models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -634,3 +639,14 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Module, bool]:
         model = PooledModel(model, pooling_method)
         warnings.warn("Pooling is applied to the model.")
         return model, transformed
+
+
+class EmbeddingTransform:
+    @classmethod
+    def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]:
+        transformed = False
+        model.old_forward = model.forward
+        model.forward = MethodType(embedding_forward, model)
+        transformed = True
+
+        return model, transformed
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index cc52658c6..d8b5da8c2 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -17,7 +17,7 @@
 ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
 ONNX_EXPORT_EXAMPLE_FBS = 4
 ONNX_EXPORT_EXAMPLE_NLK = 2  # Number of Logits to Keep
-ONNX_EXPORT_OPSET = 13
+ONNX_EXPORT_OPSET = 18
 ONNX_EXPORT_MAX_NUM_IMAGES = 1
 ONNX_EXPORT_MAX_IMAGE_TILES = 4
 ONNX_EXPORT_IMAGE_WIDTH = 560
@@ -41,6 +41,7 @@
     "pretrained_model_name_or_path",
     "attn_implementation",
     "_attn_implementation",
+    "pooling",
 ]
 
 # Minimum value for causal mask
@@ -83,7 +84,7 @@ def get_models_dir():
 ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS = 512
 ONNX_EXPORT_EXAMPLE_TOP_PS = 0.80
 ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99
-ONNX_EXPORT_OPSET = 13
+ONNX_EXPORT_OPSET = 18
 
 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]
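The new `EmbeddingTransform` works by rebinding `forward` on the model instance with `types.MethodType`, keeping the original as `old_forward`. Here is a self-contained sketch of that pattern with a toy module; the names are illustrative, not the library's classes:

```python
# Sketch of instance-level forward swapping via types.MethodType.
from types import MethodType

import torch
import torch.nn as nn


def hidden_states_only_forward(self, *args, **kwargs):
    # Delegate to the original forward, then keep only the first output.
    output = self.old_forward(*args, **kwargs)
    return output[0]


class ToyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(4, 4)

    def forward(self, x):
        # Mimic a HF-style model that returns a tuple of outputs.
        return (self.proj(x), "extra-metadata")


model = ToyEncoder()
model.old_forward = model.forward  # keep a handle to the bound original
model.forward = MethodType(hidden_states_only_forward, model)

out = model(torch.randn(2, 4))
print(out.shape)  # torch.Size([2, 4]) -- only the hidden states are returned
```

Because the swap happens per instance, the class definition and any other instances are untouched, which is why the transform can be applied conditionally.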
diff --git a/examples/embedding_model.py b/examples/embedding_model.py
index 7e6973e2e..8960b8f71 100644
--- a/examples/embedding_model.py
+++ b/examples/embedding_model.py
@@ -24,12 +24,12 @@ def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
 sentences = "This is an example sentence"
 
 # Load model from HuggingFace Hub
-tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-mistral-7b-instruct")
 
 # You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function.
 # If no pooling is specified, the model will return its default output (typically token embeddings).
-qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling)
+qeff_model = AutoModel.from_pretrained("intfloat/e5-mistral-7b-instruct", num_hidden_layers=1, pooling=max_pooling)
 # qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="max")
 # qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
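The updated example keeps passing a custom `max_pooling` callable whose body lies outside the hunk shown here. For reference, a common mask-aware max-pooling implementation looks roughly like the following; this is a sketch, and the script's own definition may differ in detail:

```python
# Typical mask-aware max pooling: padded positions are pushed to -inf so they
# can never win the max over the sequence axis.
import torch


def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).expand(last_hidden_states.size()).bool()
    masked = last_hidden_states.masked_fill(~mask, float("-inf"))
    return masked.max(dim=1).values


hidden = torch.randn(2, 5, 8)                           # (batch, seq_len, hidden)
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
print(max_pooling(hidden, mask).shape)                  # torch.Size([2, 8])
```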
diff --git a/tests/transformers/models/llama_org_2l.py b/tests/transformers/models/llama_org_2l.py
new file mode 100644
index 000000000..e019ff4a6
--- /dev/null
+++ b/tests/transformers/models/llama_org_2l.py
@@ -0,0 +1,64 @@
+from transformers import AutoModelForCausalLM
+
+from QEfficient import QEFFAutoModelForCausalLM
+from QEfficient.utils.run_utils import ApiRunner
+from QEfficient.utils._utils import create_json, load_hf_tokenizer
+from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
+
+from QEfficient.utils import hf_download
+
+
+def load_causal_lm_model(model_config):
+    """
+    Function to load model from huggingface and transform to KV model
+    --------
+
+    :model_config: Dict
+
+    :return model_hf, params
+    """
+    model_path = hf_download(
+        repo_id=model_config["model_name"],
+        ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
+    )
+    model_hf = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        use_cache=True,
+        num_hidden_layers=model_config["n_layer"],
+        attn_implementation="eager",
+        low_cpu_mem_usage=False,
+    )  # Run models for single layers only
+    # params = sum(p.numel() for p in model_hf.parameters())
+    params = ""
+    # model_hf.eval()
+    return model_hf, params
+
+
+def check_llama_onnx():
+    MODEL_ID = "meta-llama/Llama-3.2-1B"
+    # replace_transformers_quantizers()
+    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=MODEL_ID)
+    # config = AutoConfig.from_pretrained(MODEL_ID)
+    # config.num_hidden_layers = 1
+    model_config = {"model_name": MODEL_ID}
+    model_config["n_layer"] = 2
+
+    model_hf, _ = load_causal_lm_model(model_config)
+    config = model_hf.config
+    api_runner = ApiRunner(
+        1,
+        tokenizer,
+        model_hf.config,
+        "Where is the Thomas J. Watson Research Center located?",
+        32,  # prompt_len
+        64,  # ctx_len
+
+    )
+    pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
+    qeff_model = QEFFAutoModelForCausalLM(model_hf)
+    pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
+    onnx_model_path = qeff_model.export()
+    ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=False)
+
+if __name__ == "__main__":
+    # run()
+    check_llama_onnx()
\ No newline at end of file
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 2d110faeb..fbbab7f82 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -22,9 +22,16 @@
 embed_test_models = [
     {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"},
     {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"},
+    {"model_name": "intfloat/e5-mistral-7b-instruct", "pooling": "cls"},
+    {"model_name": "sentence-transformers/multi-qa-mpnet-base-cos-v1"},
+    {"model_name": "intfloat/e5-mistral-7b-instruct", "pooling": "cls"},
+    {"model_name": "nomic-ai/nomic-embed-text-v1.5", "pooling": "cls"},
+    {"model_name": "NovaSearch/stella_en_1.5B_v5", "pooling": "cls"},
+    {"model_name": "ibm-granite/granite-embedding-30m-english", "pooling": "cls"},
+    {"model_name": "BAAI/bge-reranker-v2-m3", "pooling": "cls"},
+    {"model_name": "ibm-granite/granite-embedding-107m-multilingual", "pooling": "cls"},
 ]
-
 
 def check_embed_pytorch_vs_ort_vs_ai100(
     model_name: str,
     seq_len: int = Constants.CTX_LEN,
@@ -119,7 +126,7 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model):
 
 
 @pytest.mark.on_qaic
-@pytest.mark.parametrize("model", embed_test_models[:1])
+@pytest.mark.parametrize("model", embed_test_models)
 def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model):
     """
     Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.
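One note on the new `tests/transformers/models/llama_org_2l.py` script above: it produces HF, KV-transformed, and ONNX Runtime tokens but never compares them. If comparisons are intended, they would presumably mirror the existing causal-LM tests, roughly along the lines of the hypothetical helper below (`assert_token_match` is not part of the patch):

```python
# Hypothetical assertions for check_llama_onnx(); the exact shapes and
# tolerances used by the real test suite may differ.
import numpy as np


def assert_token_match(pytorch_hf_tokens, pytorch_kv_tokens, ort_tokens):
    assert (np.asarray(pytorch_hf_tokens) == np.asarray(pytorch_kv_tokens)).all(), (
        "Tokens from the KV-transformed PyTorch model do not match the HF PyTorch model"
    )
    assert (np.asarray(pytorch_kv_tokens) == np.asarray(ort_tokens)).all(), (
        "Tokens from the ONNX Runtime session do not match the KV-transformed PyTorch model"
    )
```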