Commit 1d4ca9c

committed
Enable int8 and fp8 quantization for FLUX
1 parent ba76f6d commit 1d4ca9c

File tree

5 files changed: +257 -9 lines changed


examples/apps/flux-quantization.py

+48-8
@@ -1,6 +1,9 @@
 # %%
 # Import the following libraries
 # -----------------------------
+# Load the ModelOpt-modified model architecture and weights using Huggingface APIs
+# Add argument parsing for dtype selection
+import argparse
 import re
 
 import modelopt.torch.opt as mto
@@ -14,8 +17,27 @@
 from torch.export._trace import _export
 from transformers import AutoModelForCausalLM
 
-# Load the ModelOpt-modified model architecture and weights using Huggingface APIs
+parser = argparse.ArgumentParser(
+    description="Run Flux quantization with different dtypes"
+)
+parser.add_argument(
+    "--dtype",
+    choices=["fp8", "int8"],
+    default="int8",
+    help="Quantization data type to use (fp8 or int8)",
+)
 
+args = parser.parse_args()
+
+# Update enabled precisions based on dtype argument
+if args.dtype == "fp8":
+    enabled_precisions = {torch.float8_e4m3fn, torch.float16}
+    ptq_config = mtq.FP8_DEFAULT_CFG
+    ptq_config["quant_cfg"]["*weight_quantizer"]["axis"] = None
+else:  # int8
+    enabled_precisions = {torch.int8, torch.float16}
+    ptq_config = mtq.INT8_DEFAULT_CFG
+print(f"\nUsing {args.dtype} quantization")
 # %%
 DEVICE = "cuda:0"
 pipe = FluxPipeline.from_pretrained(
@@ -83,11 +105,10 @@ def forward_loop(mod):
     )
 
 
-ptq_config = mtq.FP8_DEFAULT_CFG
 backbone = mtq.quantize(backbone, ptq_config, forward_loop)
 mtq.disable_quantizer(backbone, filter_func)
 
-batch_size = 1
+batch_size = 2
 BATCH = torch.export.Dim("batch", min=1, max=2)
 SEQ_LEN = torch.export.Dim("seq_len", min=1, max=512)
 # This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.
@@ -129,7 +150,7 @@ def forward_loop(mod):
         backbone,
         args=(),
         kwargs=dummy_inputs,
-        # dynamic_shapes=dynamic_shapes,
+        dynamic_shapes=dynamic_shapes,
         strict=False,
         allow_complex_guards_as_runtime_asserts=True,
     )
@@ -138,10 +159,10 @@ def forward_loop(mod):
     trt_gm = torch_tensorrt.dynamo.compile(
         ep,
         inputs=dummy_inputs,
-        enabled_precisions={torch.float8_e4m3fn, torch.float16},
+        enabled_precisions=enabled_precisions,
         truncate_double=True,
         min_block_size=1,
-        debug=True,
+        debug=False,
         use_python_runtime=True,
         immutable_weights=True,
         offload_module_to_cpu=True,
@@ -156,8 +177,27 @@ def forward_loop(mod):
 # %%
 trt_gm.device = torch.device(DEVICE)
 # Function which generates images from the flux pipeline
+generate_image(pipe, ["A golden retriever"], "dog_code2")
+
+
+def benchmark(prompt, inference_step, batch_size=2, iterations=1):
+    from time import time
+
+    start = time()
+    for i in range(iterations):
+        image = pipe(
+            prompt,
+            output_type="pil",
+            num_inference_steps=inference_step,
+            num_images_per_prompt=batch_size,
+        ).images
+    end = time()
+    print("Time Elapse for", iterations, "iterations:", end - start)
+    print("Average Latency Per Step:", (end - start) / inference_step / iterations)
+    return image
+
 
-for _ in range(2):
-    generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")
+print("Benchmark Original PyTorch Module Latency (int8)")
+benchmark(["Test"], 50, iterations=3)
 
 # For this dummy model, the fp16 engine size is around 1GB, fp32 engine size is around 2GB
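A side note on the dtype switch introduced above: it amounts to mapping the --dtype flag onto a ModelOpt PTQ config plus a matching set of TensorRT precisions. Below is a minimal sketch of that mapping pulled out into a standalone helper; the function name select_quantization_settings is ours, not part of the commit, and the fp8 branch clears the weight quantizer's axis (per-tensor scaling) exactly as the example does.

import modelopt.torch.quantization as mtq
import torch


def select_quantization_settings(dtype: str):
    """Hypothetical helper mirroring the --dtype logic added in this example."""
    if dtype == "fp8":
        enabled_precisions = {torch.float8_e4m3fn, torch.float16}
        ptq_config = mtq.FP8_DEFAULT_CFG
        # Per-tensor weight scaling, as the example sets for fp8 above
        ptq_config["quant_cfg"]["*weight_quantizer"]["axis"] = None
    else:  # int8
        enabled_precisions = {torch.int8, torch.float16}
        ptq_config = mtq.INT8_DEFAULT_CFG
    return enabled_precisions, ptq_config

enabled_precisions then goes to torch_tensorrt.dynamo.compile(...) and ptq_config to mtq.quantize(...), as in the diff above.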

py/torch_tensorrt/dynamo/conversion/impl/quantize.py

+3
@@ -84,6 +84,9 @@ def quantize(
     elif num_bits == 8 and exponent_bits == 4:
         dtype = trt.DataType.FP8
 
+    if not isinstance(input_tensor, TRTTensor):
+        input_tensor = get_trt_tensor(ctx, input_tensor, name + "_quantize_input")
+
     quantize_layer = ctx.net.add_quantize(input_tensor, scale, dtype)
 
     set_layer_name(quantize_layer, target, name + "_quantize", source_ir)
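For context on this converter change: the value reaching the quantize op is not always a TensorRT ITensor yet; for weight quantization it can arrive as a frozen torch/numpy constant, and add_quantize only accepts ITensors. The added guard promotes such values with get_trt_tensor first. A minimal sketch of the same pattern in isolation follows; the import paths are assumptions (inside the patched impl/quantize.py both names are already in scope), and ensure_itensor is a hypothetical helper, not part of the commit.

# Sketch only: the isinstance/get_trt_tensor promotion pattern used above.
# Import paths below are assumptions.
from torch_tensorrt.dynamo.conversion.converter_utils import get_trt_tensor
from torch_tensorrt.dynamo.types import TRTTensor


def ensure_itensor(ctx, value, name):
    # Hypothetical helper: wrap any non-ITensor input (e.g. a frozen weight
    # constant) as a TensorRT constant before it feeds a TensorRT layer.
    if not isinstance(value, TRTTensor):
        value = get_trt_tensor(ctx, value, name)
    return value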

py/torch_tensorrt/dynamo/lowering/passes/constant_folding.py

+2
@@ -101,4 +101,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
 
     # TODO: Update this function when quantization is added
     def is_impure(self, node: torch.fx.node.Node) -> bool:
+        if node.target == torch.ops.tensorrt.quantize_op.default:
+            return True
         return False
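Why this matters: without the check, the constant-folding pass can evaluate tensorrt.quantize_op nodes whose inputs are all constants (the weight Q/DQ pattern) and bake them out of the graph before the TensorRT converter sees them. Marking the op as impure keeps those nodes in place. A small sketch for inspecting which nodes are affected, assuming ep is the exported program produced in the Flux example above:

import torch

# Sketch: count the quantize nodes that is_impure() now shields from folding.
# Assumes `ep` is the torch.export exported program from the Flux example.
quantize_nodes = [
    n
    for n in ep.graph_module.graph.nodes
    if n.op == "call_function" and n.target == torch.ops.tensorrt.quantize_op.default
]
print(f"{len(quantize_nodes)} tensorrt.quantize_op nodes kept out of constant folding")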

tools/perf/Flux/benchmark.sh

+2-1
@@ -1,4 +1,5 @@
 #TODO: Enter the HF Token
 huggingface-cli login --token HF_TOKEN
 
-python flux_perf.py > benchmark_output.txt
+python flux_quantization.py --dtype fp8 > fp8_benchmark.txt
+python flux_quantization.py --dtype int8 > int8_benchmark.txt

tools/perf/Flux/flux-quantization.py

+202
@@ -0,0 +1,202 @@
+# %%
+# Import the following libraries
+# -----------------------------
+# Load the ModelOpt-modified model architecture and weights using Huggingface APIs
+# Add argument parsing for dtype selection
+import argparse
+import re
+
+import modelopt.torch.opt as mto
+import modelopt.torch.quantization as mtq
+import torch
+import torch_tensorrt
+from diffusers import FluxPipeline
+from diffusers.models.attention_processor import Attention
+from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
+from modelopt.torch.quantization.utils import export_torch_mode
+from torch.export._trace import _export
+from transformers import AutoModelForCausalLM
+
+parser = argparse.ArgumentParser(
+    description="Run Flux quantization with different dtypes"
+)
+parser.add_argument(
+    "--dtype",
+    choices=["fp8", "int8"],
+    default="int8",
+    help="Quantization data type to use (fp8 or int8)",
+)
+
+args = parser.parse_args()
+
+# Update enabled precisions based on dtype argument
+if args.dtype == "fp8":
+    enabled_precisions = {torch.float8_e4m3fn, torch.float16}
+    ptq_config = mtq.FP8_DEFAULT_CFG
+else:  # int8
+    enabled_precisions = {torch.int8, torch.float16}
+    ptq_config = mtq.INT8_DEFAULT_CFG
+print(f"\nUsing {args.dtype} quantization")
+# %%
+DEVICE = "cuda:0"
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.float16,
+)
+pipe.transformer = FluxTransformer2DModel(
+    num_layers=1, num_single_layers=1, guidance_embeds=True
+)
+
+pipe.to(DEVICE).to(torch.float16)
+# Store the config and transformer backbone
+config = pipe.transformer.config
+# global backbone
+backbone = pipe.transformer
+backbone.eval()
+
+
+def filter_func(name):
+    pattern = re.compile(
+        r".*(time_emb_proj|time_embedding|conv_in|conv_out|conv_shortcut|add_embedding|pos_embed|time_text_embed|context_embedder|norm_out|x_embedder).*"
+    )
+    return pattern.match(name) is not None
+
+
+def generate_image(pipe, prompt, image_name):
+    seed = 42
+    image = pipe(
+        prompt,
+        output_type="pil",
+        num_inference_steps=20,
+        generator=torch.Generator("cuda").manual_seed(seed),
+    ).images[0]
+    image.save(f"{image_name}.png")
+    print(f"Image generated using {image_name} model saved as {image_name}.png")
+
+
+generate_image(pipe, ["A golden retriever holding a sign to code"], "dog_code")
+
+# %%
+# Quantization
+
+
+def do_calibrate(
+    pipe,
+    prompt: str,
+) -> None:
+    """
+    Run calibration steps on the pipeline using the given prompts.
+    """
+    image = pipe(
+        prompt,
+        output_type="pil",
+        num_inference_steps=20,
+        generator=torch.Generator("cuda").manual_seed(0),
+    ).images[0]
+
+
+def forward_loop(mod):
+    # Switch the pipeline's backbone, run calibration
+    pipe.transformer = mod
+    do_calibrate(
+        pipe=pipe,
+        prompt="test",
+    )
+
+
+backbone = mtq.quantize(backbone, ptq_config, forward_loop)
+mtq.disable_quantizer(backbone, filter_func)
+
+batch_size = 2
+BATCH = torch.export.Dim("batch", min=1, max=2)
+SEQ_LEN = torch.export.Dim("seq_len", min=1, max=512)
+# This particular min, max values for img_id input are recommended by torch dynamo during the export of the model.
+# To see this recommendation, you can try exporting using min=1, max=4096
+IMG_ID = torch.export.Dim("img_id", min=3586, max=4096)
+dynamic_shapes = {
+    "hidden_states": {0: BATCH},
+    "encoder_hidden_states": {0: BATCH, 1: SEQ_LEN},
+    "pooled_projections": {0: BATCH},
+    "timestep": {0: BATCH},
+    "txt_ids": {0: SEQ_LEN},
+    "img_ids": {0: IMG_ID},
+    "guidance": {0: BATCH},
+    "joint_attention_kwargs": {},
+    "return_dict": None,
+}
+# The guidance factor is of type torch.float32
+dummy_inputs = {
+    "hidden_states": torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(
+        DEVICE
+    ),
+    "encoder_hidden_states": torch.randn(
+        (batch_size, 512, 4096), dtype=torch.float16
+    ).to(DEVICE),
+    "pooled_projections": torch.randn((batch_size, 768), dtype=torch.float16).to(
+        DEVICE
+    ),
+    "timestep": torch.tensor([1.0] * batch_size, dtype=torch.float16).to(DEVICE),
+    "txt_ids": torch.randn((512, 3), dtype=torch.float16).to(DEVICE),
+    "img_ids": torch.randn((4096, 3), dtype=torch.float16).to(DEVICE),
+    "guidance": torch.tensor([1.0] * batch_size, dtype=torch.float32).to(DEVICE),
+    "joint_attention_kwargs": {},
+    "return_dict": False,
+}
+
+# This will create an exported program which is going to be compiled with Torch-TensorRT
+with export_torch_mode():
+    ep = _export(
+        backbone,
+        args=(),
+        kwargs=dummy_inputs,
+        dynamic_shapes=dynamic_shapes,
+        strict=False,
+        allow_complex_guards_as_runtime_asserts=True,
+    )
+
+with torch_tensorrt.logging.debug():
+    trt_gm = torch_tensorrt.dynamo.compile(
+        ep,
+        inputs=dummy_inputs,
+        enabled_precisions=enabled_precisions,
+        truncate_double=True,
+        min_block_size=1,
+        debug=False,
+        use_python_runtime=True,
+        immutable_weights=True,
+        offload_module_to_cpu=True,
+    )
+
+
+del ep
+pipe.transformer = trt_gm
+pipe.transformer.config = config
+
+
+# %%
+trt_gm.device = torch.device(DEVICE)
+# Function which generates images from the flux pipeline
+generate_image(pipe, ["A golden retriever"], "dog_code2")
+
+
+def benchmark(prompt, inference_step, batch_size=2, iterations=1):
+    from time import time
+
+    start = time()
+    for i in range(iterations):
+        image = pipe(
+            prompt,
+            output_type="pil",
+            num_inference_steps=inference_step,
+            num_images_per_prompt=batch_size,
+        ).images
+    end = time()
+    print("Time Elapse for", iterations, "iterations:", end - start)
+    print("Average Latency Per Step:", (end - start) / inference_step / iterations)
+    return image
+
+
+print("Benchmark Original PyTorch Module Latency (int8)")
+benchmark(["Test"], 50, iterations=3)
+
+# For this dummy model, the fp16 engine size is around 1GB, fp32 engine size is around 2GB
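A quick note on the numbers the benchmark() helper prints: "Average Latency Per Step" is the total wall time divided by num_inference_steps and by iterations, so with the call above (50 steps, 3 iterations, 2 images per prompt) it measures a single denoising step, not a full image. A tiny worked example with made-up timings:

# Hypothetical numbers, only to illustrate the metric printed above.
total_time = 30.0                    # seconds for the whole benchmark() call
inference_step, iterations = 50, 3   # as in benchmark(["Test"], 50, iterations=3)
print(total_time / inference_step / iterations)  # -> 0.2 s per denoising step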

0 commit comments