diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index be0981da3..236f6c9f5 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -2522,7 +2522,7 @@ def get_seq_len_and_handle_specialized_prefill_model(
     num_q_blocks = os.environ.get("NUM_Q_BLOCKS", None)
     if num_q_blocks is None:
-        block_size = 128
+        block_size = 256
         if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128:
             raise ValueError(
                 f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. "
@@ -2933,20 +2933,24 @@ def compile(
             If `prefill_seq_len` is less than `num_speculative_tokens + 1` for TLM models.
         """
+        if (kv_cache_batch_size or full_batch_size) and not self.continuous_batching:
+            logger.warning(
+                "`kv_cache_batch_size` or `full_batch_size` is being passed"
+                "This will be ignored as `continuous_batching` is set to `False` in `from_pretrained`"
+            )
+
         if prefill_only is None or not prefill_only:
             if self.continuous_batching and full_batch_size is None:
                 raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
-            if kv_cache_batch_size and not full_batch_size:
+
+        else:
+            if self.continuous_batching and kv_cache_batch_size is None and full_batch_size is None:
                 raise ValueError(
-                    "KV caching requires continuous batching. Please set `full_batch_size` and "
-                    "enable `continuous_batching=True` in `from_pretrained`."
+                    "Please pass valid integer for kv_cache_batch_size or full_batch_size, both have same meaning, as continuous_batching is enabled for prefill-only model"
                 )
-        else:
-            if self.continuous_batching:
-                if not isinstance(kv_cache_batch_size, int):
-                    raise ValueError(
-                        "Please pass valid integer for kv_cache_batch_size as continuous_batching is enabled for prefill-only model"
-                    )
+
+        # Infer kv_cache_batch_size if not provided
+        kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size
 
         # if ccl_enabled is True read Compute-Context-Length lists
         if self.ccl_enabled:
@@ -2989,14 +2993,6 @@ def compile(
         ):
             raise ValueError("Currently, sampler does not support `num_speculative_tokens` > 0.")
 
-        if kv_cache_batch_size and prefill_only is not None and prefill_only:
-            logger.warning(
-                "kv_cache_batch_size will be ignored as prefill_only is set to True unless this is GPTOSS model"
-            )
-
-        # Infer kv_cache_batch_size if not provided
-        kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size
-
         # --- Specializations ---
         specializations = []
         if prefill_only is None or prefill_only or prefill_seq_len == 1:
diff --git a/examples/gpt_oss_disagg_mode_with_chunking.py b/examples/disagg_serving/gpt_oss_disagg_mode_with_chunking.py
similarity index 90%
rename from examples/gpt_oss_disagg_mode_with_chunking.py
rename to examples/disagg_serving/gpt_oss_disagg_mode_with_chunking.py
index 363e2806c..cac646d5e 100644
--- a/examples/gpt_oss_disagg_mode_with_chunking.py
+++ b/examples/disagg_serving/gpt_oss_disagg_mode_with_chunking.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 import time
 
 import numpy as np
@@ -14,7 +15,11 @@
 from QEfficient import QEFFAutoModelForCausalLM
 from QEfficient.generation.cloud_infer import QAICInferenceSession
 
-model_id = "openai/gpt-oss-20b"  # weights are not required to convert to fp32
+dir_path = os.path.dirname(os.path.realpath(__file__))
+subfunc_npi_file_path = os.path.join(dir_path, "subfunction_120b_npi.yaml")
+non_subfunc_npi_file_path = os.path.join(dir_path, "non_subfunction_120b_npi.yaml")
+
+model_id = "openai/gpt-oss-120b"  # weights are not required to convert to fp32
 prompt = """ Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures.
@@ -27,7 +32,7 @@
 config = AutoConfig.from_pretrained(model_id)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 PREFILL_SEQ_LEN = 128
-CTX_LEN = 128 * 3
+CTX_LEN = 8192
 
 qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id)
@@ -43,6 +48,8 @@
     num_speculative_tokens=None,
     offload_pt_weights=False,  # Need the weights in memory for prefill-model export/compilation in the next step
     retain_full_kv=True,
+    # split_retained_state_io=True,  # This should be used for disagg serving via VLLM
+    node_precision_info=non_subfunc_npi_file_path,
 )
@@ -61,6 +68,8 @@
     prefill_only=True,
     enable_chunking=True,
     use_onnx_subfunctions=True,
+    # split_retained_state_io=True,  # This should be used for disagg serving via VLLM
+    node_precision_info=subfunc_npi_file_path,
 )
diff --git a/examples/disagg_serving/without_subfunc_npi_120b.yaml b/examples/disagg_serving/non_subfunction_120b_npi.yaml
similarity index 100%
rename from examples/disagg_serving/without_subfunc_npi_120b.yaml
rename to examples/disagg_serving/non_subfunction_120b_npi.yaml