32 changes: 14 additions & 18 deletions QEfficient/transformers/models/modeling_auto.py
@@ -2522,7 +2522,7 @@ def get_seq_len_and_handle_specialized_prefill_model(

num_q_blocks = os.environ.get("NUM_Q_BLOCKS", None)
if num_q_blocks is None:
-block_size = 128
+block_size = 256
if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128:
raise ValueError(
f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. "
@@ -2933,20 +2933,24 @@ def compile(
If `prefill_seq_len` is less than `num_speculative_tokens + 1` for TLM models.

"""
+if (kv_cache_batch_size or full_batch_size) and not self.continuous_batching:
+    logger.warning(
+        "`kv_cache_batch_size` or `full_batch_size` is being passed"
+        "This will be ignored as `continuous_batching` is set to `False` in `from_pretrained`"
+    )
+
if prefill_only is None or not prefill_only:
    if self.continuous_batching and full_batch_size is None:
        raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
-    if kv_cache_batch_size and not full_batch_size:
-        raise ValueError(
-            "KV caching requires continuous batching. Please set `full_batch_size` and "
-            "enable `continuous_batching=True` in `from_pretrained`."
-        )
else:
-    if self.continuous_batching:
-        if not isinstance(kv_cache_batch_size, int):
-            raise ValueError(
-                "Please pass valid integer for kv_cache_batch_size as continuous_batching is enabled for prefill-only model"
-            )
+    if self.continuous_batching and kv_cache_batch_size is None and full_batch_size is None:
+        raise ValueError(
+            "Please pass valid integer for kv_cache_batch_size or full_batch_size, both have same meaning, as continuous_batching is enabled for prefill-only model"
+        )
+
+# Infer kv_cache_batch_size if not provided
+kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

# if ccl_enabled is True read Compute-Context-Length lists
if self.ccl_enabled:
@@ -2989,14 +2993,6 @@ def compile(
):
raise ValueError("Currently, sampler does not support `num_speculative_tokens` > 0.")

-if kv_cache_batch_size and prefill_only is not None and prefill_only:
-    logger.warning(
-        "kv_cache_batch_size will be ignored as prefill_only is set to True unless this is GPTOSS model"
-    )
-
-# Infer kv_cache_batch_size if not provided
-kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

# --- Specializations ---
specializations = []
if prefill_only is None or prefill_only or prefill_seq_len == 1:
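As a minimal sketch of the revised `compile()` validation above (argument names are taken from this diff; the model ID is borrowed from the example script below, and the remaining compile arguments and exact values are illustrative assumptions, not part of this PR):

```python
# Minimal sketch of the revised batch-size validation; values are illustrative only.
from QEfficient import QEFFAutoModelForCausalLM

# continuous_batching is set in from_pretrained, as referenced by the new warning above.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-120b",
    continuous_batching=True,
)

# Non-prefill-only path: full_batch_size is still required, otherwise TypeError.
# Prefill-only path: kv_cache_batch_size or full_batch_size must be a valid integer
# (for a prefill-only model they mean the same thing), otherwise ValueError.
qeff_model.compile(
    prefill_only=True,
    kv_cache_batch_size=4,  # interchangeable with full_batch_size=4 here
    # ... other compile options (prefill_seq_len, etc.) omitted for brevity
)

# If continuous_batching were False, passing kv_cache_batch_size or full_batch_size
# now only triggers a warning and the values are ignored.
```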
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

+import os
import time

import numpy as np
@@ -14,7 +15,11 @@
from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.generation.cloud_infer import QAICInferenceSession

-model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32
+dir_path = os.path.dirname(os.path.realpath(__file__))
+subfunc_npi_file_path = os.path.join(dir_path, "subfunction_120b_npi.yaml")
+non_subfunc_npi_file_path = os.path.join(dir_path, "non_subfunction_120b_npi.yaml")
+
+model_id = "openai/gpt-oss-120b" # weights are not required to convert to fp32

prompt = """
Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures.
@@ -27,7 +32,7 @@
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
PREFILL_SEQ_LEN = 128
-CTX_LEN = 128 * 3
+CTX_LEN = 8192

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id)

@@ -43,6 +48,8 @@
num_speculative_tokens=None,
offload_pt_weights=False, # Need the weights in memory for prefill-model export/compilation in the next step
retain_full_kv=True,
+# split_retained_state_io=True, # This should be used for disagg serving via VLLM
+node_precision_info=non_subfunc_npi_file_path,
)


@@ -61,6 +68,8 @@
prefill_only=True,
enable_chunking=True,
use_onnx_subfunctions=True,
+# split_retained_state_io=True, # This should be used for disagg serving via VLLM
+node_precision_info=subfunc_npi_file_path,
)


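For reference, the block-size change in the first hunk means that, when the NUM_Q_BLOCKS environment variable is unset, `prefill_seq_len` must now be a multiple of 256 (previously 128) and at least 128, or `get_seq_len_and_handle_specialized_prefill_model` raises ValueError. A small standalone restatement of that check, as a sketch only (the function name here is hypothetical, and the branch taken when NUM_Q_BLOCKS is set is not shown in this diff):

```python
import os

# Hedged restatement of the prefill_seq_len check from the first hunk; not a drop-in API.
def check_prefill_seq_len(prefill_seq_len):
    if os.environ.get("NUM_Q_BLOCKS", None) is None:
        block_size = 256  # was 128 before this change
        if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128:
            raise ValueError(
                f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and "
                f"divisible by block_size={block_size}."
            )

check_prefill_seq_len(4096)   # OK: 4096 is a multiple of 256
# check_prefill_seq_len(128)  # would raise: 128 is not a multiple of 256
```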