Skip to content

Commit 8009d1a

Browse files
ochougule, platero97
authored and committed
added retrying downloads logic for stability (quic#370)
Signed-off-by: Onkar Chougule <[email protected]>
1 parent 91eeb68 commit 8009d1a

File tree

4 files changed

+11
-11
lines changed

4 files changed

+11
-11
lines changed

QEfficient/base/common.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
from transformers import AutoConfig
1919

2020
from QEfficient.base.modeling_qeff import QEFFBaseModel
21-
from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING
21+
from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
2222
from QEfficient.utils import login_and_download_hf_lm
2323

2424

scripts/replicate_kv_head/replicate_kv_heads.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers, undo_transformers_quantizers
1515
from QEfficient.transformers.quantizers.awq import WQLinear_GEMM
1616
from QEfficient.transformers.quantizers.gptq import QuantLinearGPTQ
17+
from QEfficient.transformers.quantizers.quantizer_compressed_tensors import FP8DeQuantLinear
1718

1819

1920
def duplicate_weights_for_linear_layer(
@@ -78,7 +79,6 @@ def main(args):
7879
model_kwargs = {"attn_implementation": "eager"}
7980
if args.num_hidden_layers:
8081
model_kwargs["num_hidden_layers"] = args.num_hidden_layers
81-
8282
model = AutoModelForCausalLM.from_pretrained(model_name, **model_kwargs)
8383

8484
# Undo the effect of replace_transformers_quantizers

tests/transformers/spd/test_pld_inference.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
145145
"""
146146
num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float
147147
input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len
148-
assert input_len_padded <= ctx_len, (
149-
"input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
150-
)
148+
assert (
149+
input_len_padded <= ctx_len
150+
), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
151151
return input_len_padded
152152

153153

tests/transformers/spd/test_spd_inference.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
8686
"""
8787
num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float
8888
input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len
89-
assert input_len_padded <= ctx_len, (
90-
"input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
91-
)
89+
assert (
90+
input_len_padded <= ctx_len
91+
), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
9292
return input_len_padded
9393

9494

@@ -335,9 +335,9 @@ def test_spec_decode_inference(
335335
for prompt, generation in zip(prompts, batch_decode):
336336
print(f"{prompt=} {generation=}")
337337
# validation check
338-
assert mean_num_accepted_tokens == float(num_speculative_tokens + 1), (
339-
f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}"
340-
)
338+
assert mean_num_accepted_tokens == float(
339+
num_speculative_tokens + 1
340+
), f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}"
341341
del target_model_session
342342
del draft_model_session
343343
generated_ids = np.asarray(generated_ids[0]).flatten()

0 commit comments

Comments (0)