Skip to content

Commit e288735

Browse files
committed
update
1 parent 50a1f01 commit e288735

File tree

4 files changed: +9 additions, −10 deletions

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor]
9191
observer = getattr(module, f"{base_name}_observer")
9292
updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
9393

94-
if hasattr(module, "input_global_scale"):
94+
if base_name == "input" and hasattr(module, "input_global_scale"):
9595
update_parameter_data(module, updated_scale, "input_global_scale")
9696
else:
9797
# update scale and zero point

src/llmcompressor/observers/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ def get_qparams(
9393
observed, calculate_global_scale=True
9494
)
9595

96-
if self.quantization_args.strategy == QuantizationStrategy.TENSOR:
96+
elif self.quantization_args.strategy == QuantizationStrategy.TENSOR:
9797
# re-calculate scale and zero point, update the stored value
9898
self._scale, self._zero_point = self.calculate_qparams(observed)
9999

src/llmcompressor/observers/min_max.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,7 @@ def calculate_qparams(
9494
max_vals = torch.max(updated_max_val, torch.zeros_like(updated_max_val))
9595
max_val_pos = torch.max(torch.abs(min_vals), torch.abs(max_vals))
9696
global_scale = FP8_E4M3_DATA.max * FP4_E2M1_DATA.max / max_val_pos
97-
print("global_scale")
98-
return global_scale, None
97+
return global_scale.to(torch.float32), None
9998

10099
return calculate_qparams(
101100
min_vals=updated_min_val,

src/llmcompressor/transformers/compression/quantization_format.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -61,13 +61,13 @@ def infer_quantization_format(
6161
)
6262
is_weight_only = len(input_args) == 0 and len(weight_args) > 0
6363

64-
if is_weight_only: # w4a16 and w8a16
65-
if (
66-
weight_args[0].num_bits == 4
67-
and weight_args[0].type == QuantizationType.FLOAT.value
68-
):
69-
return CompressionFormat.nvfp4_pack_quantized
64+
if (
65+
weight_args[0].num_bits == 4
66+
and weight_args[0].type == QuantizationType.FLOAT.value
67+
):
68+
return CompressionFormat.nvfp4_pack_quantized
7069

70+
if is_weight_only: # w4a16 and w8a16
7171
is_valid_pack = all(
7272
weight_arg.num_bits in [4, 8]
7373
and weight_arg.type == QuantizationType.INT.value

Comments (0)