[https://nvbugs/5599086][fix] Fix FP8 Linear module for spark

SimengLiu-nv · SimengLiu-nv · commit 231162414d7b · 2025-10-27T18:56:03.000-07:00
Signed-off-by: Simeng Liu <simengl@nvidia.com>
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
@@ -449,11 +449,17 @@ def load_weights_fused_qkv_linear(self, module: Linear,
         v_weight = v_weight.to(module.dtype) * weight_scale[2]
 
         fused_weight = torch.cat((q_weight, k_weight, v_weight))
+        original_device = module.weight_scale.device
+        # module.weight_scale and fused_weight must be on the same device for division operation
+        # Reset the device of module.weight_scale to the original device if changed
         if module.weight_scale.device != fused_weight.device:
             module.weight_scale = Parameter(
                 module.weight_scale.data.to(fused_weight.device))
         fused_weight = (fused_weight / module.weight_scale).to(
             torch.float8_e4m3fn)
+        if original_device != module.weight_scale.device:
+            module.weight_scale = Parameter(
+                module.weight_scale.data.to(original_device))
         copy_weight(module.weight, fused_weight)
 
         # Load k and v scales, used for NVFP4 KV cache
@@ -489,11 +495,17 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
         gate_weight = gate_weight.to(module.dtype) * weight_scale[0]
         up_weight = up_weight.to(module.dtype) * weight_scale[1]
         fused_weight = torch.cat((gate_weight, up_weight))
+        original_device = module.weight_scale.device
+        # module.weight_scale and fused_weight must be on the same device for division operation
+        # Reset the device of module.weight_scale to the original device if changed
         if module.weight_scale.device != fused_weight.device:
             module.weight_scale = Parameter(
                 module.weight_scale.data.to(fused_weight.device))
         fused_weight = (fused_weight / module.weight_scale).to(
             torch.float8_e4m3fn)
+        if original_device != module.weight_scale.device:
+            module.weight_scale = Parameter(
+                module.weight_scale.data.to(original_device))
         copy_weight(module.weight, fused_weight)