Qwen-Image-Edit-Angles

Running on Zero

App Files Community

Elea Zhong committed about 1 month ago

Commit

87c9ae9

1 Parent(s): 65e075c

run experiments

Browse files

Files changed (9) hide show

app.py +5 -43
qwenimage/experiments/experiments_qwen.py +33 -17
qwenimage/models/pipeline_qwenimage_edit_plus.py +14 -10
qwenimage/optimization.py +1 -1
scripts/plot_data.ipynb +0 -0
scripts/run_experiment.py +26 -12
scripts/run_experiment_modal.py +0 -21
scripts/run_multi.py +32 -0
scripts/run_multi_experiments.py +0 -87

app.py CHANGED Viewed

@@ -18,6 +18,7 @@ from torchao.quantization import quantize_
 from torchao.quantization import Int8WeightOnlyConfig
 from qwenimage.debug import ftimed
 from qwenimage.optimization import optimize_pipeline_
 from qwenimage.prompt import build_camera_prompt
 from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
@@ -28,49 +29,10 @@ from qwenimage.models.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
-pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509",
-                                                transformer= QwenImageTransformer2DModel.from_pretrained("linoyts/Qwen-Image-Edit-Rapid-AIO",
-                                                                                                         subfolder='transformer',
-                                                                                                         torch_dtype=dtype,
-                                                                                                         device_map='cuda'),torch_dtype=dtype).to(device)
-pipe.load_lora_weights(
-        "dx8152/Qwen-Edit-2509-Multiple-angles",
-        weight_name="镜头转换.safetensors", adapter_name="angles"
-    )
-# pipe.load_lora_weights(
-#         "lovis93/next-scene-qwen-image-lora-2509",
-#         weight_name="next-scene_lora-v2-3000.safetensors", adapter_name="next-scene"
-#     )
-pipe.set_adapters(["angles"], adapter_weights=[1.])
-pipe.fuse_lora(adapter_names=["angles"], lora_scale=1.25)
-# pipe.fuse_lora(adapter_names=["next-scene"], lora_scale=1.)
-pipe.unload_lora_weights()
-pipe.transformer.__class__ = QwenImageTransformer2DModel
-pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
-# transformer_clone = copy.deepcopy(pipe.transformer)
-# quantize_(pipe.transformer, Int8WeightOnlyConfig())
-# torch.save(pipe.transformer.state_dict(), "transformer_int8.pt")
-# assert False
-# from torchao.quantization import Int8DynamicActivationInt4WeightConfig
-# quantize_(pipe.transformer, Int8DynamicActivationInt4WeightConfig())
-optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024))], prompt="prompt", height=1024, width=1024, num_inference_steps=4)
-# state_dict = torch.load("transformer_int8.pt")
-# print(state_dict.keys())
-# # state_dict = pipe.transformer.state_dict()
-# print(pipe.transformer.state_dict().keys())
-# zerogpu_weights = ZeroGPUWeights({name: weight for name, weight in state_dict.items()})
-# compiled_transformer = ZeroGPUCompiledModel("transformer.pt2", zerogpu_weights)
-# spaces.aoti_apply(compiled_transformer, pipe.transformer)
 MAX_SEED = np.iinfo(np.int32).max

 from torchao.quantization import Int8WeightOnlyConfig
 from qwenimage.debug import ftimed
+from qwenimage.experiments.experiments_qwen import Qwen_FA3_AoT_int8, Qwen_int4
 from qwenimage.optimization import optimize_pipeline_
 from qwenimage.prompt import build_camera_prompt
 from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 dtype = torch.bfloat16
 device = "cuda" if torch.cuda.is_available() else "cpu"
+exp = Qwen_FA3_AoT_int8()
+exp.load()
+exp.optimize()
+pipe = exp.pipe
 MAX_SEED = np.iinfo(np.int32).max

qwenimage/experiments/experiments_qwen.py CHANGED Viewed

@@ -4,16 +4,26 @@ import os
 from pathlib import Path
 import random
 import statistics
 import torch
 from PIL import Image
 import pandas as pd
 from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
 from qwenimage.models.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 from qwenimage.experiment import AbstractExperiment, ExperimentConfig
 from qwenimage.debug import ProfileSession, ftimed
-from qwenimage.optimization import optimize_pipeline_
 from qwenimage.prompt import build_camera_prompt
@@ -124,9 +134,6 @@ class QwenBaseExperiment(AbstractExperiment):
     @ftimed
     def optimize(self):
-        # pipe.transformer.__class__ = QwenImageTransformer2DModel
-        # pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
-        # optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024))], prompt="prompt", height=1024, width=1024, num_inference_steps=4)
         pass
     @ftimed
@@ -181,7 +188,7 @@ class Qwen_FA3(QwenBaseExperiment):
 class Qwen_AoT(QwenBaseExperiment):
     @ftimed
     def optimize(self):
-        self.compiled_transformer = optimize_pipeline_(
             self.pipe,
             cache_compiled=self.config.cache_compiled,
             quantize=False,
@@ -191,10 +198,7 @@ class Qwen_AoT(QwenBaseExperiment):
                 "num_inference_steps":4
             }
         )
-    def cleanup(self):
-        super().cleanup()
-        del self.compiled_transformer
 @ExperimentRegistry.register(name="qwen_fa3_aot")
 class Qwen_FA3_AoT(QwenBaseExperiment):
@@ -202,7 +206,7 @@ class Qwen_FA3_AoT(QwenBaseExperiment):
     def optimize(self):
         self.pipe.transformer.__class__ = QwenImageTransformer2DModel
         self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
-        self.compiled_transformer = optimize_pipeline_(
             self.pipe,
             cache_compiled=self.config.cache_compiled,
             quantize=False,
@@ -214,9 +218,6 @@ class Qwen_FA3_AoT(QwenBaseExperiment):
             }
         )
-    def cleanup(self):
-        super().cleanup()
-        del self.compiled_transformer
 @ExperimentRegistry.register(name="qwen_fa3_aot_int8")
 class Qwen_FA3_AoT_int8(QwenBaseExperiment):
@@ -224,7 +225,7 @@ class Qwen_FA3_AoT_int8(QwenBaseExperiment):
     def optimize(self):
         self.pipe.transformer.__class__ = QwenImageTransformer2DModel
         self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
-        self.compiled_transformer = optimize_pipeline_(
             self.pipe,
             cache_compiled=self.config.cache_compiled,
             quantize=True,
@@ -236,6 +237,21 @@ class Qwen_FA3_AoT_int8(QwenBaseExperiment):
             }
         )
-    def cleanup(self):
-        super().cleanup()
-        del self.compiled_transformer

 from pathlib import Path
 import random
 import statistics
+import os
 import torch
 from PIL import Image
 import pandas as pd
+from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights
+from torchao.quantization import Float8WeightOnlyConfig, Int4WeightOnlyConfig, Int8DynamicActivationInt4WeightConfig, Int8DynamicActivationInt8WeightConfig, quantize_
+from torchao.quantization import Int8WeightOnlyConfig
+import spaces
+import torch
+from torch.utils._pytree import tree_map
+from torchao.utils import get_model_size_in_bytes
+from qwenimage.debug import ftimed, print_first_param
 from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
 from qwenimage.models.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 from qwenimage.experiment import AbstractExperiment, ExperimentConfig
 from qwenimage.debug import ProfileSession, ftimed
+from qwenimage.optimization import INDUCTOR_CONFIGS, TRANSFORMER_DYNAMIC_SHAPES, aoti_apply, drain_module_parameters, optimize_pipeline_
 from qwenimage.prompt import build_camera_prompt
     @ftimed
     def optimize(self):
         pass
     @ftimed
 class Qwen_AoT(QwenBaseExperiment):
     @ftimed
     def optimize(self):
+        optimize_pipeline_(
             self.pipe,
             cache_compiled=self.config.cache_compiled,
             quantize=False,
                 "num_inference_steps":4
             }
         )
 @ExperimentRegistry.register(name="qwen_fa3_aot")
 class Qwen_FA3_AoT(QwenBaseExperiment):
     def optimize(self):
         self.pipe.transformer.__class__ = QwenImageTransformer2DModel
         self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
+        optimize_pipeline_(
             self.pipe,
             cache_compiled=self.config.cache_compiled,
             quantize=False,
             }
         )
 @ExperimentRegistry.register(name="qwen_fa3_aot_int8")
 class Qwen_FA3_AoT_int8(QwenBaseExperiment):
     def optimize(self):
         self.pipe.transformer.__class__ = QwenImageTransformer2DModel
         self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
+        optimize_pipeline_(
             self.pipe,
             cache_compiled=self.config.cache_compiled,
             quantize=True,
             }
         )
+@ExperimentRegistry.register(name="qwen_fp8")
+class Qwen_fp8(QwenBaseExperiment):
+    @ftimed
+    def optimize(self):
+        self.pipe.transformer.__class__ = QwenImageTransformer2DModel
+        self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
+        quantize_(self.pipe.transformer, Float8WeightOnlyConfig())
+@ExperimentRegistry.register(name="qwen_int8")
+class Qwen_int8(QwenBaseExperiment):
+    @ftimed
+    def optimize(self):
+        self.pipe.transformer.__class__ = QwenImageTransformer2DModel
+        self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
+        quantize_(self.pipe.transformer, Int8WeightOnlyConfig())

qwenimage/models/pipeline_qwenimage_edit_plus.py CHANGED Viewed

@@ -876,11 +876,12 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                         if XLA_AVAILABLE:
                             xm.mark_step()
-        with ctimed("Post (vae)"):
-            self._current_timestep = None
-            if output_type == "latent":
-                image = latents
-            else:
                 latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
                 latents = latents.to(self.vae.dtype)
                 latents_mean = (
@@ -892,14 +893,17 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
                     latents.device, latents.dtype
                 )
                 latents = latents / latents_std + latents_mean
-                with ctimed("vae.decode"):
-                    image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
                 image = self.image_processor.postprocess(image, output_type=output_type)
-            # Offload all models
             self.maybe_free_model_hooks()
-            if not return_dict:
-                return (image,)
         return QwenImagePipelineOutput(images=image)

                         if XLA_AVAILABLE:
                             xm.mark_step()
+        # with ctimed("Post (vae)"):
+        self._current_timestep = None
+        if output_type == "latent":
+            image = latents
+        else:
+            with ctimed("pre decode"):
                 latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
                 latents = latents.to(self.vae.dtype)
                 latents_mean = (
                     latents.device, latents.dtype
                 )
                 latents = latents / latents_std + latents_mean
+            with ctimed("vae.decode"):
+                image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
+            with ctimed("post process"):
                 image = self.image_processor.postprocess(image, output_type=output_type)
+        # Offload all models
+        with ctimed("offload"):
             self.maybe_free_model_hooks()
+        if not return_dict:
+            return (image,)
         return QwenImagePipelineOutput(images=image)

qwenimage/optimization.py CHANGED Viewed

@@ -68,6 +68,7 @@ def drain_module_parameters(module: torch.nn.Module):
 @ftimed
 def optimize_pipeline_(
         pipeline: Callable[P, Any],
         cache_compiled=True,
@@ -116,4 +117,3 @@ def optimize_pipeline_(
     aoti_apply(compiled_transformer, pipeline.transformer)
-    return compiled_transformer

 @ftimed
+@spaces.GPU(duration=1500)
 def optimize_pipeline_(
         pipeline: Callable[P, Any],
         cache_compiled=True,
     aoti_apply(compiled_transformer, pipeline.transformer)

scripts/plot_data.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

scripts/run_experiment.py CHANGED Viewed

@@ -1,19 +1,33 @@
 from qwenimage.experiment import ExperimentConfig
-from qwenimage.experiments.experiments_qwen import Qwen_AoT, QwenBaseExperiment, ExperimentRegistry
-# experiment = QwenBaseExperiment(ExperimentConfig(name="qwen-base"))
-# experiment.load()
-# experiment.optimize()
-# experiment.run()
-# experiment.report()
-experiment = Qwen_AoT(ExperimentConfig(name="qwen-aot"))
-experiment.load()
-experiment.optimize()
-experiment.run()
-experiment.report()

+import argparse
+from qwenimage.debug import clear_cuda_memory, print_gpu_memory
 from qwenimage.experiment import ExperimentConfig
+from qwenimage.experiments.experiments_qwen import PipeInputs, Qwen_AoT, QwenBaseExperiment, ExperimentRegistry
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--name", type=str, required=True)
+    parser.add_argument("--iterations", type=int, default=100)
+    args = parser.parse_args()
+    name = args.name
+    pipe_inputs = PipeInputs()
+    experiment = ExperimentRegistry.get(name)(
+        config=ExperimentConfig(
+            name=name,
+            iterations=args.iterations,
+        ),
+        pipe_inputs=pipe_inputs,
+    )
+    experiment.load()
+    experiment.optimize()
+    experiment.run()
+    experiment.report()
+if __name__ == "__main__":
+    main()

scripts/run_experiment_modal.py DELETED Viewed

@@ -1,21 +0,0 @@
-import modal
-from qwenimage.experiment import ExperimentConfig
-from qwenimage.experiments.experiments_qwen import QwenBaseExperiment
-app = modal.App("gradio-demo")
-app.image = (
-    modal.Image.debian_slim(python_version="3.10")
-    .apt_install("git", "ffmpeg", "libsm6", "libxext6")
-    .pip_install_from_requirements(os.path.abspath("./requirements.txt"))
-    .add_local_python_source("qwenimage")
-)
-experiment = QwenBaseExperiment(ExperimentConfig(name="qwen-base"))
-experiment.load()
-experiment.optimize()
-experiment.run()
-experiment.report()

scripts/run_multi.py ADDED Viewed

@@ -0,0 +1,32 @@

+import argparse
+import subprocess
+import sys
+from qwenimage.experiments.experiments_qwen import ExperimentRegistry
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--iterations", type=int, default=4)
+    args = parser.parse_args()
+    experiment_names = ExperimentRegistry.keys()
+    print(f"{len(experiment_names)}x {experiment_names}")
+    for name in experiment_names:
+        print(name)
+        cmd = [
+            sys.executable,
+            "scripts/run_experiment.py",
+            "--name", name,
+            "--iterations", str(args.iterations),
+        ]
+        result = subprocess.run(cmd, check=True, capture_output=False, text=True)
+        print(result)
+if __name__ == "__main__":
+    main()

scripts/run_multi_experiments.py DELETED Viewed

@@ -1,87 +0,0 @@
-import matplotlib.pyplot as plt
-import pandas as pd
-from qwenimage.debug import clear_cuda_memory, print_gpu_memory
-from qwenimage.experiment import ExperimentConfig
-from qwenimage.experiments.experiments_qwen import ExperimentRegistry, PipeInputs
-experiment_names = ExperimentRegistry.keys()
-print(experiment_names)
-pipe_inputs = PipeInputs()
-# Collect results from all experiments
-all_results = []
-for name in experiment_names:
-    print(f"Running {name}")
-    experiment = ExperimentRegistry.get(name)(
-        config=ExperimentConfig(
-            name=name,
-            iterations=10,
-        ),
-        pipe_inputs=pipe_inputs,
-    )
-    experiment.load()
-    experiment.optimize()
-    experiment.run()
-    base_df, base_raw_data = experiment.report()
-    # Add experiment name to the dataframe
-    base_df['experiment'] = name
-    all_results.append(base_df)
-    experiment.cleanup()
-    del experiment
-    clear_cuda_memory()
-    print_gpu_memory(clear_mem=None)
-# Combine all results
-combined_df = pd.concat(all_results, ignore_index=True)
-# Define desired names to plot
-desired_names = ["loop", "QwenBaseExperiment.run_once"]
-# Filter for desired names
-plot_data = combined_df[combined_df['name'].isin(desired_names)].copy()
-print(plot_data)
-# Sort by mean in descending order (rightmost = lowest mean)
-plot_data = plot_data.sort_values('mean', ascending=False)
-# Create bar plot
-fig, ax = plt.subplots(figsize=(12, 6))
-# Create x positions for bars
-x_pos = range(len(plot_data))
-# Plot bars with error bars
-bars = ax.bar(x_pos, plot_data['mean'], yerr=plot_data['std'],
-               capsize=5, alpha=0.7, edgecolor='black')
-# Customize plot
-ax.set_xlabel('Method', fontsize=12, fontweight='bold')
-ax.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
-ax.set_title('Performance Comparison: Mean Execution Time with Standard Deviation',
-             fontsize=14, fontweight='bold')
-ax.set_xticks(x_pos)
-ax.set_xticklabels([f"{row['experiment']}\n{row['name']}"
-                     for _, row in plot_data.iterrows()],
-                    rotation=45, ha='right')
-ax.grid(axis='y', alpha=0.3)
-# Add value labels on top of bars
-for i, (idx, row) in enumerate(plot_data.iterrows()):
-    ax.text(i, row['mean'] + row['std'], f"{row['mean']:.3f}s",
-            ha='center', va='bottom', fontsize=9)
-plt.tight_layout()
-plt.savefig('reports/performance_comparison.png', dpi=300, bbox_inches='tight')
-print("\nPerformance comparison plot saved to: reports/performance_comparison.png")
-plt.show()