32 changes: 14 additions & 18 deletions QEfficient/transformers/models/modeling_auto.py
@@ -2522,7 +2522,7 @@ def get_seq_len_and_handle_specialized_prefill_model(

num_q_blocks = os.environ.get("NUM_Q_BLOCKS", None)
if num_q_blocks is None:
-block_size = 128
+block_size = 256
if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128:
raise ValueError(
f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and divisible by block_size={block_size}. "
@@ -2933,20 +2933,24 @@ def compile(
If `prefill_seq_len` is less than `num_speculative_tokens + 1` for TLM models.

"""
+if (kv_cache_batch_size or full_batch_size) and not self.continuous_batching:
+    logger.warning(
+        "`kv_cache_batch_size` or `full_batch_size` is being passed"
+        "This will be ignored as `continuous_batching` is set to `False` in `from_pretrained`"
+    )
+
if prefill_only is None or not prefill_only:
    if self.continuous_batching and full_batch_size is None:
        raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
-    if kv_cache_batch_size and not full_batch_size:
-        raise ValueError(
-            "KV caching requires continuous batching. Please set `full_batch_size` and "
-            "enable `continuous_batching=True` in `from_pretrained`."
-        )
else:
-    if self.continuous_batching:
-        if not isinstance(kv_cache_batch_size, int):
-            raise ValueError(
-                "Please pass valid integer for kv_cache_batch_size as continuous_batching is enabled for prefill-only model"
-            )
+    if self.continuous_batching and kv_cache_batch_size is None and full_batch_size is None:
+        raise ValueError(
+            "Please pass valid integer for kv_cache_batch_size or full_batch_size, both have same meaning, as continuous_batching is enabled for prefill-only model"
+        )
+
+# Infer kv_cache_batch_size if not provided
+kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

# if ccl_enabled is True read Compute-Context-Length lists
if self.ccl_enabled:
@@ -2989,14 +2993,6 @@ def compile(
):
raise ValueError("Currently, sampler does not support `num_speculative_tokens` > 0.")

-if kv_cache_batch_size and prefill_only is not None and prefill_only:
-    logger.warning(
-        "kv_cache_batch_size will be ignored as prefill_only is set to True unless this is GPTOSS model"
-    )
-
-# Infer kv_cache_batch_size if not provided
-kv_cache_batch_size = kv_cache_batch_size or full_batch_size or batch_size

# --- Specializations ---
specializations = []
if prefill_only is None or prefill_only or prefill_seq_len == 1:
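As a minimal sketch of the revised `compile()` validation above (argument names are taken from this diff; the model ID is borrowed from the example script below, and the remaining compile arguments and exact values are illustrative assumptions, not part of this PR):

```python
# Minimal sketch of the revised batch-size validation; values are illustrative only.
from QEfficient import QEFFAutoModelForCausalLM

# continuous_batching is set in from_pretrained, as referenced by the new warning above.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-120b",
    continuous_batching=True,
)

# Non-prefill-only path: full_batch_size is still required, otherwise TypeError.
# Prefill-only path: kv_cache_batch_size or full_batch_size must be a valid integer
# (for a prefill-only model they mean the same thing), otherwise ValueError.
qeff_model.compile(
    prefill_only=True,
    kv_cache_batch_size=4,  # interchangeable with full_batch_size=4 here
    # ... other compile options (prefill_seq_len, etc.) omitted for brevity
)

# If continuous_batching were False, passing kv_cache_batch_size or full_batch_size
# now only triggers a warning and the values are ignored.
```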
@@ -5,6 +5,7 @@
#
# -----------------------------------------------------------------------------

+import os
import time

import numpy as np
@@ -14,7 +15,11 @@
from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.generation.cloud_infer import QAICInferenceSession

-model_id = "openai/gpt-oss-20b" # weights are not required to convert to fp32
+dir_path = os.path.dirname(os.path.realpath(__file__))
+subfunc_npi_file_path = os.path.join(dir_path, "subfunction_120b_npi.yaml")
+non_subfunc_npi_file_path = os.path.join(dir_path, "non_subfunction_120b_npi.yaml")
+
+model_id = "openai/gpt-oss-120b" # weights are not required to convert to fp32

prompt = """
Once upon a time, in a small town, there lived a young boy named Alex. Alex was a curious and adventurous child, always eager to explore the world around him. One day, while playing in the park, Alex stumbled upon a mysterious old book hidden beneath a pile of leaves. The book was filled with stories of distant lands, magical creatures, and extraordinary adventures.
@@ -27,7 +32,7 @@
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
PREFILL_SEQ_LEN = 128
-CTX_LEN = 128 * 3
+CTX_LEN = 8192

qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_id)

@@ -43,6 +48,8 @@
num_speculative_tokens=None,
offload_pt_weights=False, # Need the weights in memory for prefill-model export/compilation in the next step
retain_full_kv=True,
+# split_retained_state_io=True, # This should be used for disagg serving via VLLM
+node_precision_info=non_subfunc_npi_file_path,
)


@@ -61,6 +68,8 @@
prefill_only=True,
enable_chunking=True,
use_onnx_subfunctions=True,
+# split_retained_state_io=True, # This should be used for disagg serving via VLLM
+node_precision_info=subfunc_npi_file_path,
)


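For reference, the block-size change in the first hunk means that, when the NUM_Q_BLOCKS environment variable is unset, `prefill_seq_len` must now be a multiple of 256 (previously 128) and at least 128, or `get_seq_len_and_handle_specialized_prefill_model` raises ValueError. A small standalone restatement of that check, as a sketch only (the function name here is hypothetical, and the branch taken when NUM_Q_BLOCKS is set is not shown in this diff):

```python
import os

# Hedged restatement of the prefill_seq_len check from the first hunk; not a drop-in API.
def check_prefill_seq_len(prefill_seq_len):
    if os.environ.get("NUM_Q_BLOCKS", None) is None:
        block_size = 256  # was 128 before this change
        if prefill_seq_len is None or prefill_seq_len % block_size != 0 or prefill_seq_len < 128:
            raise ValueError(
                f"When prefill_only=True, 'prefill_seq_len' must be explicitly set and "
                f"divisible by block_size={block_size}."
            )

check_prefill_seq_len(4096)   # OK: 4096 is a multiple of 256
# check_prefill_seq_len(128)  # would raise: 128 is not a multiple of 256
```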