1 change: 1 addition & 0 deletions QEfficient/base/modeling_qeff.py
@@ -313,6 +313,7 @@ def _compile(
"mdp_ts_json": mdp_ts_json,
"num_speculative_tokens": num_speculative_tokens,
}

compile_hash = hash_dict_params(compile_hash_params)

compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
8 changes: 8 additions & 0 deletions QEfficient/transformers/embeddings/embedding_utils.py
@@ -98,6 +98,14 @@ def forward(
return self.pooling_fn(output[0], attention_mask)


def embedding_forward(
self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
):
print("Forward swapped with new one")
output = self.old_forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
return output[0]


def validate_user_pooling_function(user_function):
"""
Validate a user-provided pooling function to ensure it meets the required interface.
12 changes: 7 additions & 5 deletions QEfficient/transformers/models/modeling_auto.py
@@ -161,14 +161,16 @@ class QEFFAutoModel(QEFFTransformersBase):
_pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

def __init__(self, model: nn.Module, pooling=None, **kwargs):
def __init__(self, model: nn.Module, **kwargs):
super().__init__(model, **kwargs)

# Make Embedding specific transforms like appending pooling
if pooling:
self.model, _ = PoolingTransform.apply(self.model, pooling)
if kwargs["pooling"]:
self.model, _ = PoolingTransform.apply(self.model, kwargs["pooling"])
# else:
# self.model, _ = EmbeddingTransform.apply(self.model)

self.model.base_model.config.use_cache = True
# self.model.base_model.config.use_cache = True

self.hash_params["qeff_auto_class"] = self.__class__.__name__

@@ -396,7 +398,7 @@ def cloud_ai_100_feature_generate(
outputs = self.qpc_session.run(inputs)
except Exception:
outputs = {
"output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
"output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[0]).astype(
np.float32
),
}
18 changes: 17 additions & 1 deletion QEfficient/transformers/models/pytorch_transforms.py
@@ -154,7 +154,12 @@

from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform
from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
from QEfficient.transformers.embeddings.embedding_utils import (
POOLING_MAP,
PooledModel,
embedding_forward,
validate_user_pooling_function,
)
from QEfficient.transformers.models.codegen.modeling_codegen import (
QEffCodeGenAttention,
QeffCodeGenBlock,
@@ -634,3 +639,14 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu
model = PooledModel(model, pooling_method)
warnings.warn("Pooling is applied to the model.")
return model, transformed


class EmbeddingTransform:
@classmethod
def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]:
transformed = False
model.old_forward = model.forward
model.forward = MethodType(embedding_forward, model)
transformed = True

return model, transformed
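
For reference, here is a minimal standalone sketch of the forward-swap pattern that the new EmbeddingTransform relies on, written against plain transformers rather than QEfficient: the original forward is preserved as old_forward and the bound replacement returns only the first output (the last hidden state). The checkpoint name is only illustrative.

from types import MethodType
from typing import Optional

import torch
from transformers import AutoModel, AutoTokenizer


def embedding_forward(self, input_ids: Optional[torch.Tensor] = None,
                      attention_mask: Optional[torch.Tensor] = None, **kwargs):
    # Delegate to the preserved forward and keep only the first output,
    # i.e. last_hidden_state for standard transformers models.
    output = self.old_forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
    return output[0]


model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")  # illustrative checkpoint
model.old_forward = model.forward  # keep the original bound method
model.forward = MethodType(embedding_forward, model)  # bind the replacement to this instance

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
inputs = tokenizer("This is an example sentence", return_tensors="pt")
print(model(**inputs).shape)  # a plain (batch, seq_len, hidden) tensor instead of a ModelOutput

Because nn.Module.__call__ dispatches to self.forward, which resolves to the instance attribute first, the swap takes effect for ordinary model(**inputs) calls.
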
5 changes: 3 additions & 2 deletions QEfficient/utils/constants.py
@@ -17,7 +17,7 @@
ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
ONNX_EXPORT_EXAMPLE_FBS = 4
ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep
ONNX_EXPORT_OPSET = 13
ONNX_EXPORT_OPSET = 18
ONNX_EXPORT_MAX_NUM_IMAGES = 1
ONNX_EXPORT_MAX_IMAGE_TILES = 4
ONNX_EXPORT_IMAGE_WIDTH = 560
@@ -41,6 +41,7 @@
"pretrained_model_name_or_path",
"attn_implementation",
"_attn_implementation",
"pooling",
]

# Minimum value for causal mask
@@ -83,7 +84,7 @@ def get_models_dir():
ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS = 512
ONNX_EXPORT_EXAMPLE_TOP_PS = 0.80
ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99
ONNX_EXPORT_OPSET = 13
ONNX_EXPORT_OPSET = 18

COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]

4 changes: 2 additions & 2 deletions examples/embedding_model.py
@@ -24,12 +24,12 @@ def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
sentences = "This is an example sentence"

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-mistral-7b-instruct")


# You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function.
# If no pooling is specified, the model will return its default output (typically token embeddings).
qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling)
qeff_model = AutoModel.from_pretrained("intfloat/e5-mistral-7b-instruct", num_hidden_layers=1, pooling=max_pooling)
# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="max")
# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

64 changes: 64 additions & 0 deletions tests/transformers/models/llama_org_2l.py
@@ -0,0 +1,64 @@
from transformers import AutoModelForCausalLM

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils.run_utils import ApiRunner
from QEfficient.utils._utils import create_json, load_hf_tokenizer

Check failure on line 5 (GitHub Actions / lint, Ruff F401): tests/transformers/models/llama_org_2l.py:5:37: `QEfficient.utils._utils.create_json` imported but unused
from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers

Check failure on line 6 (GitHub Actions / lint, Ruff F401): tests/transformers/models/llama_org_2l.py:6:53: `QEfficient.transformers.quantizers.auto.replace_transformers_quantizers` imported but unused

from QEfficient.utils import hf_download

Check failure on line 8 (GitHub Actions / lint, Ruff I001): tests/transformers/models/llama_org_2l.py:1:1: Import block is un-sorted or un-formatted

def load_causal_lm_model(model_config):
"""
Function to load model from huggingface and transform to KV model
--------

:model_config: Dict

:return model_hf, params
"""
model_path = hf_download(
repo_id=model_config["model_name"],
ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
)
model_hf = AutoModelForCausalLM.from_pretrained(
model_path,
use_cache=True,
num_hidden_layers=model_config["n_layer"],
attn_implementation="eager",
low_cpu_mem_usage=False,
) # Run models for single layers only
# params = sum(p.numel() for p in model_hf.parameters())
params=""
# model_hf.eval()
return model_hf, params


def check_llama_onnx():
MODEL_ID = "meta-llama/Llama-3.2-1B"
# replace_transformers_quantizers()
tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=MODEL_ID)
# config = AutoConfig.from_pretrained(MODEL_ID)
# config.num_hidden_layers = 1
model_config = {"model_name": MODEL_ID}
model_config["n_layer"] = 2

model_hf, _ = load_causal_lm_model(model_config)
config = model_hf.config

Check failure on line 46 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:46:5: Local variable `config` is assigned to but never used
api_runner = ApiRunner(
1,
tokenizer,
model_hf.config,
"Where is the Thomas J. Watson Research Center located?",
32, #prompt_len
64, #ctx_len

)
pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)

Check failure on line 56 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:56:5: Local variable `pytorch_hf_tokens` is assigned to but never used
qeff_model = QEFFAutoModelForCausalLM(model_hf)
pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)

Check failure on line 58 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:58:5: Local variable `pytorch_kv_tokens` is assigned to but never used
onnx_model_path = qeff_model.export()
ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=False)

Check failure on line 60 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:60:5: Local variable `ort_tokens` is assigned to but never used

if __name__ == "__main__":
# run()
check_llama_onnx()
11 changes: 9 additions & 2 deletions tests/transformers/models/test_embedding_models.py
@@ -22,9 +22,16 @@
embed_test_models = [
{"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"},
{"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"},
{"model_name": "intfloat/e5-mistral-7b-instruct", "pooling":"cls"},
{"model_name": "sentence-transformers/multi-qa-mpnet-base-cos-v1"},
{"model_name": "intfloat/e5-mistral-7b-instruct", "pooling": "cls"},
{"model_name": "nomic-ai/nomic-embed-text-v1.5", "pooling": "cls"},
{"model_name": "NovaSearch/stella_en_1.5B_v5", "pooling": "cls"},
{"model_name": "ibm-granite/granite-embedding-30m-english", "pooling": "cls"},
{"model_name": "BAAI/bge-reranker-v2-m3", "pooling": "cls"},
{"model_name": "ibm-granite/granite-embedding-107m-multilingual", "pooling": "cls"},
]


def check_embed_pytorch_vs_ort_vs_ai100(
model_name: str,
seq_len: int = Constants.CTX_LEN,
@@ -119,7 +126,7 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model):


@pytest.mark.on_qaic
@pytest.mark.parametrize("model", embed_test_models[:1])
@pytest.mark.parametrize("model", embed_test_models)
def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model):
"""
Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.