Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 17 additions & 20 deletions QEfficient/base/modeling_qeff.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pathlib import Path
from typing import Dict, List, Optional

from QEfficient.transformers.modeling_utils import SPECIALIZED_DISAGG_SERVING_MODEL_ARCH
import onnx
import torch

Expand Down Expand Up @@ -240,7 +241,7 @@ def _export(

# Return early if ONNX already exists
if onnx_path.is_file():
if prefill_only:
if prefill_only and self.model.config.model_type in SPECIALIZED_DISAGG_SERVING_MODEL_ARCH:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
Expand Down Expand Up @@ -322,7 +323,7 @@ def _export(
finally:
shutil.rmtree(tmp_onnx_dir, ignore_errors=True)

if prefill_only:
if prefill_only and self.model.config.model_type in SPECIALIZED_DISAGG_SERVING_MODEL_ARCH:
self.prefill_onnx_path = onnx_path
else:
self.onnx_path = onnx_path
Expand All @@ -342,15 +343,16 @@ def get_onnx_path(
"use_onnx_subfunctions": use_onnx_subfunctions,
"retain_full_kv": retain_full_kv,
}
if prefill_only:
if self.prefill_onnx_path is None:
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)

if self.model.config.model_type in SPECIALIZED_DISAGG_SERVING_MODEL_ARCH:
kwargs.update(
{
"prefill_only": prefill_only,
"prefill_seq_len": specializations[0].get("seq_len"),
"enable_chunking": enable_chunking,
}
)
if prefill_only and self.prefill_onnx_path is None:
self.export(**kwargs)
return self.prefill_onnx_path
else:
Expand Down Expand Up @@ -467,6 +469,10 @@ def _compile(
else:
mdp_ts_json = None

if use_onnx_subfunctions:
logger.info("Using ONNX subfunctions for compilation.")
command.append("-sub-functions")

compile_hash_params = {
"command": command,
"specializations": specializations,
Expand Down Expand Up @@ -514,16 +520,7 @@ def _compile(

command.append(f"-aic-binary-dir={qpc_path}")
logger.info(f"Running compiler: {' '.join(command)}")
if use_onnx_subfunctions:

class FeatureNotAvailableError(Exception):
pass

exec_command = f'QAIC_COMPILER_OPTS_UNSUPPORTED="-loader-inline-all=0" {" ".join(command)}'
raise FeatureNotAvailableError(
"ONNX graph is exported with subfunctions, assert version of apps SDK should be used for compiling this model."
+ f"\nRun following command manually with assert compiler:\n{exec_command}"
)
try:
subprocess.run(command, capture_output=True, check=True)
except subprocess.CalledProcessError as e:
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/diffusers/pipelines/flux/pipeline_flux.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def compile(
self.export(use_onnx_subfunctions=use_onnx_subfunctions)

# Load compilation configuration
config_manager(self, config_source=compile_config)
config_manager(self, config_source=compile_config, use_onnx_subfunctions=use_onnx_subfunctions)

# Calculate compressed latent dimension using utility function
cl, latent_height, latent_width = calculate_compressed_latent_dimension(
Expand Down
7 changes: 6 additions & 1 deletion QEfficient/diffusers/pipelines/pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def calculate_latent_dimensions_with_frames(
return cl, latent_height, latent_width, latent_frames


def config_manager(cls, config_source: Optional[str] = None):
def config_manager(cls, config_source: Optional[str] = None, use_onnx_subfunctions: bool = False):
"""
JSON-based compilation configuration manager for diffusion pipelines.

Expand All @@ -109,6 +109,11 @@ def config_manager(cls, config_source: Optional[str] = None):

cls.custom_config = load_json(config_source)

# Enable ONNX subfunctions for specific modules if requested
for module_name, _ in cls.modules.items():
if module_name in ONNX_SUBFUNCTION_MODULE:
cls.custom_config["modules"][module_name]["compilation"]["use_onnx_subfunctions"] = use_onnx_subfunctions


def set_module_device_ids(cls):
"""
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/diffusers/pipelines/wan/pipeline_wan.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def compile(
self.export(use_onnx_subfunctions=use_onnx_subfunctions)

# Load compilation configuration
config_manager(self, config_source=compile_config)
config_manager(self, config_source=compile_config, use_onnx_subfunctions=use_onnx_subfunctions)

# Configure pipeline dimensions and calculate compressed latent parameters
cl, latent_height, latent_width, latent_frames = calculate_latent_dimensions_with_frames(
Expand Down
2 changes: 1 addition & 1 deletion QEfficient/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@
DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH = {"gemma3", "llama4", "gemma3_text", "llama4_text"}

# This is for supporting different modelling classes specially written for prefill-only model
SPECIALIZED_PREFILL_ONLY_MODEL_ARCH = {"gpt_oss"}
SPECIALIZED_DISAGG_SERVING_MODEL_ARCH = {"gpt_oss"}

# Define a transformers layers to QEff layers dictionary
# While onboarding new models make sure to add the new layer maps to this dictionary.
Expand Down
76 changes: 34 additions & 42 deletions QEfficient/transformers/models/modeling_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
from QEfficient.generation.vlm_generation import VisionLanguageGeneration
from QEfficient.transformers.modeling_utils import (
DYNAMIC_SEQ_LEN_SUPPORTED_MODEL_ARCH,
SPECIALIZED_PREFILL_ONLY_MODEL_ARCH,
SPECIALIZED_DISAGG_SERVING_MODEL_ARCH,
)
from QEfficient.transformers.models.pytorch_transforms import (
BlockedKVAttentionTransform,
Expand Down Expand Up @@ -2522,7 +2522,7 @@ def get_seq_len_and_handle_specialized_prefill_model(

num_q_blocks = os.environ.get("NUM_Q_BLOCKS", None)
if num_q_blocks is None:
block_size = 128
block_size = 256
if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128:
raise ValueError(
f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. "
Expand Down Expand Up @@ -2588,31 +2588,28 @@ def export(
self.model.config, fbs if self.continuous_batching else bs, seq_len
)
enable_chunking = kwargs.get("enable_chunking", False)
if prefill_only:
if not enable_chunking and self.continuous_batching:
raise NotImplementedError(
"Looks like you are trying to run prefix-caching without chunking, this feature is not available yet!"
)
self.prefill(enable=True, enable_chunking=enable_chunking)
self.hash_params.pop("retain_full_kv", None)
seq_len = (
self.get_seq_len_and_handle_specialized_prefill_model(

# TODO: move this to a DA Serving utility class
if self.model.config.model_type in SPECIALIZED_DISAGG_SERVING_MODEL_ARCH:
if prefill_only:
if self.continuous_batching and not enable_chunking:
raise NotImplementedError("Can't enable prefix-caching without chunking")
self.prefill(enable=True, enable_chunking=enable_chunking)
self.hash_params.pop("retain_full_kv", None)
seq_len = self.get_seq_len_and_handle_specialized_prefill_model(
prefill_seq_len=prefill_seq_len, enable_chunking=enable_chunking
)
if self.model.config.model_type in SPECIALIZED_PREFILL_ONLY_MODEL_ARCH
else seq_len
)
kv_cache_shape[2] = seq_len + self.model.config.sliding_window if enable_chunking else seq_len
else:
self.prefill(False, retain_full_kv=kwargs.get("retain_full_kv", False))
self.hash_params.pop("prefill_only", None)
self.hash_params.pop("NUM_Q_BLOCKS", None)
self.hash_params.pop("NUM_FFN_BLOCKS", None)
self.hash_params.pop("ENABLE_OPT_SWA", None)
self.hash_params.pop("chunking", None)
if kwargs.get("retain_full_kv", False):
kv_cache_shape[2] = seq_len + self.model.config.sliding_window
self.hash_params["retain_full_kv"] = True
kv_cache_shape[2] = seq_len + self.model.config.sliding_window if enable_chunking else seq_len
else:
self.prefill(False, retain_full_kv=kwargs.get("retain_full_kv", False))
self.hash_params.pop("prefill_only", None)
self.hash_params.pop("NUM_Q_BLOCKS", None)
self.hash_params.pop("NUM_FFN_BLOCKS", None)
self.hash_params.pop("ENABLE_OPT_SWA", None)
self.hash_params.pop("chunking", None)
if kwargs.get("retain_full_kv", False):
kv_cache_shape[2] = seq_len + self.model.config.sliding_window
self.hash_params["retain_full_kv"] = True

example_inputs = {
"input_ids": torch.zeros((bs, seq_len), dtype=torch.int64),
Expand Down Expand Up @@ -2933,20 +2930,23 @@ def compile(
If `prefill_seq_len` is less than `num_speculative_tokens + 1` for TLM models.

"""
if (kv_cache_batch_size or full_batch_size) and not self.continuous_batching:
logger.warning(
"`kv_cache_batch_size` or `full_batch_size` is being passed"
"This will be ignored as `continuous_batching` is set to `False` in `from_pretrained`"
)

if prefill_only is None or not prefill_only:
if self.continuous_batching and full_batch_size is None:
raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
if kv_cache_batch_size and not full_batch_size:
else:
if self.continuous_batching and kv_cache_batch_size is None and full_batch_size is None:
raise ValueError(
"KV caching requires continuous batching. Please set `full_batch_size` and "
"enable `continuous_batching=True` in `from_pretrained`."
"Please pass valid integer for kv_cache_batch_size or full_batch_size, both have same meaning, as continuous_batching is enabled for prefill-only model"
)
else:
if self.continuous_batching:
if not isinstance(kv_cache_batch_size, int):
raise ValueError(
"Please pass valid integer for kv_cache_batch_size as continuous_batching is enabled for prefill-only model"
)

# Infer kv_cache_batch_size if not provided
kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

# if ccl_enabled is True read Compute-Context-Length lists
if self.ccl_enabled:
Expand Down Expand Up @@ -2989,14 +2989,6 @@ def compile(
):
raise ValueError("Currently, sampler does not support `num_speculative_tokens` > 0.")

if kv_cache_batch_size and prefill_only is not None and prefill_only:
logger.warning(
"kv_cache_batch_size will be ignored as prefill_only is set to True unless this is GPTOSS model"
)

# Infer kv_cache_batch_size if not provided
kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

# --- Specializations ---
specializations = []
if prefill_only is None or prefill_only or prefill_seq_len == 1:
Expand Down
4 changes: 3 additions & 1 deletion QEfficient/utils/export_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,9 @@ def _setup_onnx_subfunctions(qeff_model, args, kwargs):
qeff_model._onnx_transforms.append(CustomOpTransform)

# TODO: Handle this in the modelling class QEFFTransformersBase,remove from here. Refer diffusers implementation
kwargs["export_modules_as_functions"] = get_decoder_layer_classes_for_export(qeff_model.model)
decoder_layer_classes = get_decoder_layer_classes_for_export(qeff_model.model)
if decoder_layer_classes:
kwargs["export_modules_as_functions"] = decoder_layer_classes
return args, kwargs


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

import os
import time

import numpy as np
Expand All @@ -14,7 +15,11 @@
from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.generation.cloud_infer import QAICInferenceSession

model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32
dir_path = os.path.dirname(os.path.realpath(__file__))
subfunc_npi_file_path = os.path.join(dir_path, "subfunction_120b_npi.yaml")
non_subfunc_npi_file_path = os.path.join(dir_path, "non_subfunction_120b_npi.yaml")

model_id = "openai/gpt-oss-120b" # weights are not required to convert to fp32

prompt = """
Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures.
Expand All @@ -27,7 +32,7 @@
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
PREFILL_SEQ_LEN = 128
CTX_LEN = 128 * 3
CTX_LEN = 8192

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id)

Expand All @@ -43,6 +48,8 @@
num_speculative_tokens=None,
offload_pt_weights=False, # Need the weights in memory for prefill-model export/compilation in the next step
retain_full_kv=True,
# split_retained_state_io=True, # This should be used for disagg serving via VLLM
node_precision_info=non_subfunc_npi_file_path,
)


Expand All @@ -61,6 +68,8 @@
prefill_only=True,
enable_chunking=True,
use_onnx_subfunctions=True,
# split_retained_state_io=True, # This should be used for disagg serving via VLLM
node_precision_info=subfunc_npi_file_path,
)


Expand Down
20 changes: 16 additions & 4 deletions tests/transformers/test_causal_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,17 @@ def test_causal_lm_export_and_hash(config, cb, tmp_path):


@pytest.mark.parametrize("cb", [False, True], ids=["nocb", "cb"])
@pytest.mark.parametrize("subfunc", [False, True], ids=["False", "True"])
@pytest.mark.parametrize("subfunc", [False, True], ids=["non-subfunc", "subfunc"])
@pytest.mark.parametrize("prefill_only", [False, True], ids=["pref+decode", "prefill-only"])
@pytest.mark.parametrize("config", configs, ids=config_ids)
def test_causal_lm_hash_creation(config, cb, subfunc, tmp_path):
def test_causal_lm_hash_creation(config, cb, subfunc, prefill_only, tmp_path):
if config.model_type == "gpt_oss" and prefill_only:
pytest.skip(
"gpt_oss prefill_only mode has different logic to create hash as we have two different ONNX for prefill/decode for this model for disagg serving"
)
model = AutoModelForCausalLM.from_config(config, **model_kwargs)
qeff_model = QEFFAutoModelForCausalLM(model, cb)
qeff_model.export(tmp_path, use_onnx_subfunctions=subfunc)
qeff_model.export(tmp_path, use_onnx_subfunctions=subfunc, prefill_only=prefill_only)
hash_params = {}
hash_params["config"] = qeff_model.model.config.to_diff_dict()
hash_params["peft_config"] = None
Expand Down Expand Up @@ -251,12 +256,19 @@ def tmp_cache(tmp_path, monkeypatch):
yield tmp_path


@pytest.mark.parametrize("prefill_only", [False, True], ids=["pref+decode", "prefill_only"])
@pytest.mark.parametrize("cb", [False, True], ids=["nocb", "cb"])
@pytest.mark.parametrize("config", configs, ids=config_ids)
def test_causal_lm_compile(config, cb, tmp_cache):
def test_causal_lm_compile(config, cb, prefill_only, tmp_cache):
if config.model_type == "gpt_oss":
pytest.skip(
"gpt_oss prefill_only mode has different logic to create hash as we have two different ONNX for prefill/decode for this model for disagg serving"
)
model = AutoModelForCausalLM.from_config(config, **model_kwargs)
qeff_model = QEFFAutoModelForCausalLM(model, cb)
compile_params = {"prefill_seq_len": 8, "ctx_len": 16}
if prefill_only:
compile_params["prefill_only"] = True
if cb:
compile_params["full_batch_size"] = 32
compile_params["batch_size"] = 8
Expand Down
21 changes: 11 additions & 10 deletions tests/transformers/test_subfunction.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import onnx
import pytest
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers import AutoConfig, AutoModelForCausalLM

from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM

Expand Down Expand Up @@ -65,7 +65,7 @@ def get_gpt2block_call_count(onnx_path):
@pytest.mark.on_qaic
@pytest.mark.parametrize("config", configs, ids=config_ids)
def test_subfunction_vs_nonsubfunction(config, tmp_path):
tokenizer = AutoTokenizer.from_pretrained(config.model_type)
# tokenizer = AutoTokenizer.from_pretrained(config.model_type)
model_0_0 = QEFFAutoModelForCausalLM(AutoModelForCausalLM.from_config(config, **model_kwargs), cb=False)

# Export with subfunctions enabled
Expand Down Expand Up @@ -104,16 +104,17 @@ def test_subfunction_vs_nonsubfunction(config, tmp_path):
"Expected NO QEffGPT2Block function calls in graph when use_onnx_subfunctions=False"
)

# TODO: Re-enable this check when generation is fully deterministic
# Compile and test generation to ensure functional equivalence
compile_params = {"prefill_seq_len": 8, "ctx_len": 16}
# compile_params = {"prefill_seq_len": 8, "ctx_len": 16}

model_0_0.compile(onnx_path=with_sub_func_onnx, **compile_params)
generation_00 = model_0_0.generate(prompts=["Help me with this"], tokenizer=tokenizer)
# model_0_0.compile(onnx_path=with_sub_func_onnx, **compile_params, use_onnx_subfunctions=True)
# generation_00 = model_0_0.generate(prompts=["Help me with this"], tokenizer=tokenizer)

model_0_0.compile(onnx_path=without_sub_func_onnx, **compile_params)
generation_01 = model_0_0.generate(prompts=["Help me with this"], tokenizer=tokenizer)
# model_0_0.compile(onnx_path=without_sub_func_onnx, **compile_params)
# generation_01 = model_0_0.generate(prompts=["Help me with this"], tokenizer=tokenizer)

# Verify that both models produce the same output
assert generation_00.generated_texts == generation_01.generated_texts, (
"Models with and without subfunctions should produce identical outputs"
)
# assert generation_00.generated_texts == generation_01.generated_texts, (
# "Models with and without subfunctions should produce identical outputs"
# )