Commit 18918ff

Supported LoRA
1 parent a590393 commit 18918ff

File tree

4 files changed

+176 -178 lines changed


examples/apps/flux-demo.py

+143
@@ -0,0 +1,143 @@
import os

import gradio as gr
import torch
import torch_tensorrt
from diffusers import FluxPipeline, StableDiffusionPipeline
from torch.export._trace import _export

DEVICE = "cuda:0"
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=torch.float16,
)
pipe.to(DEVICE).to(torch.float16)
backbone = pipe.transformer


batch_size = 2
BATCH = torch.export.Dim("batch", min=1, max=8)

# These particular min and max values for the img_id input are recommended by torch dynamo during the export of the model.
# To see this recommendation, you can try exporting using min=1, max=4096
dynamic_shapes = {
    "hidden_states": {0: BATCH},
    "encoder_hidden_states": {0: BATCH},
    "pooled_projections": {0: BATCH},
    "timestep": {0: BATCH},
    "txt_ids": {},
    "img_ids": {},
    "guidance": {0: BATCH},
    "joint_attention_kwargs": {},
    "return_dict": None,
}

settings = {
    "strict": False,
    "allow_complex_guards_as_runtime_asserts": True,
    "enabled_precisions": {torch.float32},
    "truncate_double": True,
    "min_block_size": 1,
    "use_fp32_acc": True,
    "use_explicit_typing": True,
    "debug": False,
    "use_python_runtime": True,
    "immutable_weights": False,
}

trt_gm = torch_tensorrt.MutableTorchTensorRTModule(backbone, **settings)
trt_gm.set_expected_dynamic_shape_range((), dynamic_shapes)
pipe.transformer = trt_gm


def generate_image(prompt, inference_step, batch_size=1):
    image = pipe(
        prompt,
        output_type="pil",
        num_inference_steps=inference_step,
        num_images_per_prompt=batch_size,
    ).images
    return image


generate_image(["A golden retriever holding a sign to code"], 2)


def model_change(model):
    if model == "Torch Model":
        pipe.transformer = backbone
        backbone.to(DEVICE)
    else:
        backbone.to("cpu")
        pipe.transformer = trt_gm
        torch.cuda.empty_cache()


def load_lora(path):
    pipe.load_lora_weights(
        path,
        adapter_name="lora1",
    )
    pipe.set_adapters(["lora1"], adapter_weights=[1])
    pipe.fuse_lora()
    pipe.unload_lora_weights()
    print("LoRA loaded!")


# Create Gradio interface
with gr.Blocks(title="Flux Demo with Torch-TensorRT") as demo:
    gr.Markdown("# Flux Image Generation Demo Accelerated by Torch-TensorRT")

    with gr.Row():
        with gr.Column():
            # Input components
            prompt_input = gr.Textbox(
                label="Prompt", placeholder="Enter your prompt here...", lines=3
            )
            model_dropdown = gr.Dropdown(
                choices=["Torch Model", "Torch-TensorRT Accelerated Model"],
                value="Torch-TensorRT Accelerated Model",
                label="Model Variant",
            )

            lora_upload_path = gr.Textbox(
                label="LoRA Path",
                placeholder="/home/TensorRT/examples/apps/NGRVNG.safetensors",
                lines=2,
            )
            num_steps = gr.Slider(
                minimum=20, maximum=100, value=20, step=1, label="Inference Steps"
            )
            batch_size = gr.Slider(
                minimum=1, maximum=8, value=1, step=1, label="Batch Size"
            )

            generate_btn = gr.Button("Generate Image")
            load_lora_btn = gr.Button("Load LoRA")

        with gr.Column():
            # Output component
            output_image = gr.Gallery(label="Generated Image")

    # Connect the button to the generation function
    model_dropdown.change(model_change, inputs=[model_dropdown])
    generate_btn.click(
        fn=generate_image,
        inputs=[
            prompt_input,
            num_steps,
            batch_size,
        ],
        outputs=output_image,
    )
    load_lora_btn.click(
        fn=load_lora,
        inputs=[
            lora_upload_path,
        ],
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch()
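For reference, the LoRA flow can also be exercised without the Gradio UI. The sketch below is illustrative only (the .safetensors path, adapter name, and step count are placeholders): because pipe.transformer is the MutableTorchTensorRTModule, the weight change produced by fuse_lora() should be picked up on the next pipeline call and handled by refitting the existing TRT engine rather than recompiling it.

# Illustrative only: the path and adapter name are placeholders.
lora_path = "/path/to/your_lora.safetensors"

pipe.load_lora_weights(lora_path, adapter_name="lora1")
pipe.set_adapters(["lora1"], adapter_weights=[1])
pipe.fuse_lora()            # fuses the LoRA deltas into the backbone weights
pipe.unload_lora_weights()  # drops the now-fused adapter state

# The next call runs through trt_gm; the mutated weights trigger a refit, not a rebuild.
images = generate_image(["A golden retriever holding a sign to code"], 30)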

examples/apps/flux_demo.py

-158
This file was deleted.

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

+6 -8
@@ -469,14 +469,12 @@ def _save_weight_mapping(self) -> None:
         # Stage 1: Name mapping
         torch_device = to_torch_device(self.compilation_settings.device)
         gm_is_on_cuda = get_model_device(self.module).type == "cuda"
-        if not gm_is_on_cuda:
-            # If the model original position is on CPU, move it GPU
-            sd = {
-                k: v.reshape(-1).to(torch_device)
-                for k, v in self.module.state_dict().items()
-            }
-        else:
-            sd = {k: v.reshape(-1) for k, v in self.module.state_dict().items()}
+        # If the model original position is on CPU, move it GPU
+        sd = {
+            k: v.reshape(-1).to(torch_device)
+            for k, v in self.module.state_dict().items()
+        }
+
         weight_name_map: dict[str, Any] = {}
         np_map = {}
         constant_mapping = {}
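A side note on the change above (not part of the commit): the CPU/CUDA branch can be collapsed because Tensor.to returns the original tensor when the target device and dtype already match, so unconditionally calling .to(torch_device) costs nothing for weights that are already on the GPU. A quick illustration, assuming a CUDA device is available:

import torch

t = torch.randn(4, device="cuda")
# No copy is made when the tensor is already on the requested device.
assert t.to(t.device).data_ptr() == t.data_ptr()

c = torch.randn(4)  # CPU tensor
assert c.to("cuda").device.type == "cuda"  # still moved to the GPU, as before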

py/torch_tensorrt/dynamo/runtime/_MutableTorchTensorRTModule.py

+27 -12
@@ -62,6 +62,8 @@ def __init__(
         device: Optional[Union[Device, torch.device, str]] = _defaults.DEVICE,
         use_python_runtime: bool = _defaults.USE_PYTHON_RUNTIME,
         immutable_weights: bool = False,
+        strict: bool = True,
+        allow_complex_guards_as_runtime_asserts: bool = False,
         **kwargs: Any,
     ) -> None:
         """
@@ -125,6 +127,10 @@ def __init__(
         self.arg_inputs: tuple[Any, ...] = tuple()
         self.kwarg_inputs: dict[str, Any] = {}
         self.additional_settings = kwargs
+        self.strict = strict
+        self.allow_complex_guards_as_runtime_asserts = (
+            allow_complex_guards_as_runtime_asserts
+        )
         self.use_python_runtime = use_python_runtime
         self.trt_device = to_torch_tensorrt_device(device)
         assert (
@@ -262,9 +268,7 @@ def refit_gm(self) -> None:
         """
         self.original_model.to(to_torch_device(self.trt_device))
         if self.exp_program is None:
-            self.exp_program = torch.export.export(
-                self.original_model, self.arg_inputs, kwargs=self.kwarg_inputs
-            )
+            self.exp_program = self.get_exported_program()
         else:
             self.exp_program._state_dict = (
                 MutableTorchTensorRTModule._transform_state_dict(
@@ -283,6 +287,25 @@ def refit_gm(self) -> None:
         self.original_model.cpu()
         torch.cuda.empty_cache()
 
+    def get_exported_program(self) -> torch.export.ExportedProgram:
+        if self.allow_complex_guards_as_runtime_asserts:
+            return torch.export._trace._export(
+                self.original_model,
+                self.arg_inputs,
+                kwargs=self.kwarg_inputs,
+                dynamic_shapes=self._get_total_dynamic_shapes(),
+                strict=self.strict,
+                allow_complex_guards_as_runtime_asserts=self.allow_complex_guards_as_runtime_asserts,
+            )
+        else:
+            return torch.export.export(
+                self.original_model,
+                self.arg_inputs,
+                kwargs=self.kwarg_inputs,
+                dynamic_shapes=self._get_total_dynamic_shapes(),
+                strict=self.strict,
+            )
+
     def compile(self) -> None:
         """
         (Re)compile the TRT graph module using the PyTorch module.
@@ -292,15 +315,7 @@ def compile(self) -> None:
         """
         # Export the module
         self.original_model.to(to_torch_device(self.trt_device))
-        self.exp_program = torch.export._trace._export(
-            self.original_model,
-            self.arg_inputs,
-            kwargs=self.kwarg_inputs,
-            dynamic_shapes=self._get_total_dynamic_shapes(),
-            strict=False,
-            allow_complex_guards_as_runtime_asserts=True,
-            # **self.additional_settings
-        )
+        self.exp_program = self.get_exported_program()
         self.gm = dynamo_compile(
             self.exp_program,
             arg_inputs=self.arg_inputs,