Elea Zhong committed on
Commit
87c9ae9
·
1 Parent(s): 65e075c

run experiments

Browse files
app.py CHANGED
@@ -18,6 +18,7 @@ from torchao.quantization import quantize_
18
  from torchao.quantization import Int8WeightOnlyConfig
19
 
20
  from qwenimage.debug import ftimed
 
21
  from qwenimage.optimization import optimize_pipeline_
22
  from qwenimage.prompt import build_camera_prompt
23
  from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
@@ -28,49 +29,10 @@ from qwenimage.models.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
28
  dtype = torch.bfloat16
29
  device = "cuda" if torch.cuda.is_available() else "cpu"
30
 
31
- pipe = QwenImageEditPlusPipeline.from_pretrained("Qwen/Qwen-Image-Edit-2509",
32
- transformer= QwenImageTransformer2DModel.from_pretrained("linoyts/Qwen-Image-Edit-Rapid-AIO",
33
- subfolder='transformer',
34
- torch_dtype=dtype,
35
- device_map='cuda'),torch_dtype=dtype).to(device)
36
-
37
- pipe.load_lora_weights(
38
- "dx8152/Qwen-Edit-2509-Multiple-angles",
39
- weight_name="镜头转换.safetensors", adapter_name="angles"
40
- )
41
-
42
- # pipe.load_lora_weights(
43
- # "lovis93/next-scene-qwen-image-lora-2509",
44
- # weight_name="next-scene_lora-v2-3000.safetensors", adapter_name="next-scene"
45
- # )
46
- pipe.set_adapters(["angles"], adapter_weights=[1.])
47
- pipe.fuse_lora(adapter_names=["angles"], lora_scale=1.25)
48
- # pipe.fuse_lora(adapter_names=["next-scene"], lora_scale=1.)
49
- pipe.unload_lora_weights()
50
-
51
-
52
- pipe.transformer.__class__ = QwenImageTransformer2DModel
53
- pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
54
-
55
- # transformer_clone = copy.deepcopy(pipe.transformer)
56
-
57
- # quantize_(pipe.transformer, Int8WeightOnlyConfig())
58
- # torch.save(pipe.transformer.state_dict(), "transformer_int8.pt")
59
- # assert False
60
-
61
- # from torchao.quantization import Int8DynamicActivationInt4WeightConfig
62
- # quantize_(pipe.transformer, Int8DynamicActivationInt4WeightConfig())
63
-
64
- optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024))], prompt="prompt", height=1024, width=1024, num_inference_steps=4)
65
-
66
- # state_dict = torch.load("transformer_int8.pt")
67
- # print(state_dict.keys())
68
- # # state_dict = pipe.transformer.state_dict()
69
- # print(pipe.transformer.state_dict().keys())
70
- # zerogpu_weights = ZeroGPUWeights({name: weight for name, weight in state_dict.items()})
71
- # compiled_transformer = ZeroGPUCompiledModel("transformer.pt2", zerogpu_weights)
72
-
73
- # spaces.aoti_apply(compiled_transformer, pipe.transformer)
74
 
75
 
76
  MAX_SEED = np.iinfo(np.int32).max
 
18
  from torchao.quantization import Int8WeightOnlyConfig
19
 
20
  from qwenimage.debug import ftimed
21
+ from qwenimage.experiments.experiments_qwen import Qwen_FA3_AoT_int8, Qwen_int4
22
  from qwenimage.optimization import optimize_pipeline_
23
  from qwenimage.prompt import build_camera_prompt
24
  from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 
29
  dtype = torch.bfloat16
30
  device = "cuda" if torch.cuda.is_available() else "cpu"
31
 
32
+ exp = Qwen_FA3_AoT_int8()
33
+ exp.load()
34
+ exp.optimize()
35
+ pipe = exp.pipe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
 
38
  MAX_SEED = np.iinfo(np.int32).max
qwenimage/experiments/experiments_qwen.py CHANGED
@@ -4,16 +4,26 @@ import os
4
  from pathlib import Path
5
  import random
6
  import statistics
 
 
7
  import torch
8
  from PIL import Image
9
  import pandas as pd
 
 
 
 
 
 
 
10
 
 
11
  from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
12
  from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
13
  from qwenimage.models.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
14
  from qwenimage.experiment import AbstractExperiment, ExperimentConfig
15
  from qwenimage.debug import ProfileSession, ftimed
16
- from qwenimage.optimization import optimize_pipeline_
17
  from qwenimage.prompt import build_camera_prompt
18
 
19
 
@@ -124,9 +134,6 @@ class QwenBaseExperiment(AbstractExperiment):
124
 
125
  @ftimed
126
  def optimize(self):
127
- # pipe.transformer.__class__ = QwenImageTransformer2DModel
128
- # pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
129
- # optimize_pipeline_(pipe, image=[Image.new("RGB", (1024, 1024))], prompt="prompt", height=1024, width=1024, num_inference_steps=4)
130
  pass
131
 
132
  @ftimed
@@ -181,7 +188,7 @@ class Qwen_FA3(QwenBaseExperiment):
181
  class Qwen_AoT(QwenBaseExperiment):
182
  @ftimed
183
  def optimize(self):
184
- self.compiled_transformer = optimize_pipeline_(
185
  self.pipe,
186
  cache_compiled=self.config.cache_compiled,
187
  quantize=False,
@@ -191,10 +198,7 @@ class Qwen_AoT(QwenBaseExperiment):
191
  "num_inference_steps":4
192
  }
193
  )
194
-
195
- def cleanup(self):
196
- super().cleanup()
197
- del self.compiled_transformer
198
 
199
  @ExperimentRegistry.register(name="qwen_fa3_aot")
200
  class Qwen_FA3_AoT(QwenBaseExperiment):
@@ -202,7 +206,7 @@ class Qwen_FA3_AoT(QwenBaseExperiment):
202
  def optimize(self):
203
  self.pipe.transformer.__class__ = QwenImageTransformer2DModel
204
  self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
205
- self.compiled_transformer = optimize_pipeline_(
206
  self.pipe,
207
  cache_compiled=self.config.cache_compiled,
208
  quantize=False,
@@ -214,9 +218,6 @@ class Qwen_FA3_AoT(QwenBaseExperiment):
214
  }
215
  )
216
 
217
- def cleanup(self):
218
- super().cleanup()
219
- del self.compiled_transformer
220
 
221
  @ExperimentRegistry.register(name="qwen_fa3_aot_int8")
222
  class Qwen_FA3_AoT_int8(QwenBaseExperiment):
@@ -224,7 +225,7 @@ class Qwen_FA3_AoT_int8(QwenBaseExperiment):
224
  def optimize(self):
225
  self.pipe.transformer.__class__ = QwenImageTransformer2DModel
226
  self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
227
- self.compiled_transformer = optimize_pipeline_(
228
  self.pipe,
229
  cache_compiled=self.config.cache_compiled,
230
  quantize=True,
@@ -236,6 +237,21 @@ class Qwen_FA3_AoT_int8(QwenBaseExperiment):
236
  }
237
  )
238
 
239
- def cleanup(self):
240
- super().cleanup()
241
- del self.compiled_transformer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  from pathlib import Path
5
  import random
6
  import statistics
7
+ import os
8
+
9
  import torch
10
  from PIL import Image
11
  import pandas as pd
12
+ from spaces.zero.torch.aoti import ZeroGPUCompiledModel, ZeroGPUWeights
13
+ from torchao.quantization import Float8WeightOnlyConfig, Int4WeightOnlyConfig, Int8DynamicActivationInt4WeightConfig, Int8DynamicActivationInt8WeightConfig, quantize_
14
+ from torchao.quantization import Int8WeightOnlyConfig
15
+ import spaces
16
+ import torch
17
+ from torch.utils._pytree import tree_map
18
+ from torchao.utils import get_model_size_in_bytes
19
 
20
+ from qwenimage.debug import ftimed, print_first_param
21
  from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
22
  from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
23
  from qwenimage.models.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
24
  from qwenimage.experiment import AbstractExperiment, ExperimentConfig
25
  from qwenimage.debug import ProfileSession, ftimed
26
+ from qwenimage.optimization import INDUCTOR_CONFIGS, TRANSFORMER_DYNAMIC_SHAPES, aoti_apply, drain_module_parameters, optimize_pipeline_
27
  from qwenimage.prompt import build_camera_prompt
28
 
29
 
 
134
 
135
  @ftimed
136
  def optimize(self):
 
 
 
137
  pass
138
 
139
  @ftimed
 
188
  class Qwen_AoT(QwenBaseExperiment):
189
  @ftimed
190
  def optimize(self):
191
+ optimize_pipeline_(
192
  self.pipe,
193
  cache_compiled=self.config.cache_compiled,
194
  quantize=False,
 
198
  "num_inference_steps":4
199
  }
200
  )
201
+
 
 
 
202
 
203
  @ExperimentRegistry.register(name="qwen_fa3_aot")
204
  class Qwen_FA3_AoT(QwenBaseExperiment):
 
206
  def optimize(self):
207
  self.pipe.transformer.__class__ = QwenImageTransformer2DModel
208
  self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
209
+ optimize_pipeline_(
210
  self.pipe,
211
  cache_compiled=self.config.cache_compiled,
212
  quantize=False,
 
218
  }
219
  )
220
 
 
 
 
221
 
222
  @ExperimentRegistry.register(name="qwen_fa3_aot_int8")
223
  class Qwen_FA3_AoT_int8(QwenBaseExperiment):
 
225
  def optimize(self):
226
  self.pipe.transformer.__class__ = QwenImageTransformer2DModel
227
  self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
228
+ optimize_pipeline_(
229
  self.pipe,
230
  cache_compiled=self.config.cache_compiled,
231
  quantize=True,
 
237
  }
238
  )
239
 
240
+
241
+ @ExperimentRegistry.register(name="qwen_fp8")
242
+ class Qwen_fp8(QwenBaseExperiment):
243
+ @ftimed
244
+ def optimize(self):
245
+ self.pipe.transformer.__class__ = QwenImageTransformer2DModel
246
+ self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
247
+ quantize_(self.pipe.transformer, Float8WeightOnlyConfig())
248
+
249
+
250
+ @ExperimentRegistry.register(name="qwen_int8")
251
+ class Qwen_int8(QwenBaseExperiment):
252
+ @ftimed
253
+ def optimize(self):
254
+ self.pipe.transformer.__class__ = QwenImageTransformer2DModel
255
+ self.pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())
256
+ quantize_(self.pipe.transformer, Int8WeightOnlyConfig())
257
+
qwenimage/models/pipeline_qwenimage_edit_plus.py CHANGED
@@ -876,11 +876,12 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
876
  if XLA_AVAILABLE:
877
  xm.mark_step()
878
 
879
- with ctimed("Post (vae)"):
880
- self._current_timestep = None
881
- if output_type == "latent":
882
- image = latents
883
- else:
 
884
  latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
885
  latents = latents.to(self.vae.dtype)
886
  latents_mean = (
@@ -892,14 +893,17 @@ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
892
  latents.device, latents.dtype
893
  )
894
  latents = latents / latents_std + latents_mean
895
- with ctimed("vae.decode"):
896
- image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
 
897
  image = self.image_processor.postprocess(image, output_type=output_type)
898
 
899
- # Offload all models
 
 
900
  self.maybe_free_model_hooks()
901
 
902
- if not return_dict:
903
- return (image,)
904
 
905
  return QwenImagePipelineOutput(images=image)
 
876
  if XLA_AVAILABLE:
877
  xm.mark_step()
878
 
879
+ # with ctimed("Post (vae)"):
880
+ self._current_timestep = None
881
+ if output_type == "latent":
882
+ image = latents
883
+ else:
884
+ with ctimed("pre decode"):
885
  latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
886
  latents = latents.to(self.vae.dtype)
887
  latents_mean = (
 
893
  latents.device, latents.dtype
894
  )
895
  latents = latents / latents_std + latents_mean
896
+ with ctimed("vae.decode"):
897
+ image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
898
+ with ctimed("post process"):
899
  image = self.image_processor.postprocess(image, output_type=output_type)
900
 
901
+
902
+ # Offload all models
903
+ with ctimed("offload"):
904
  self.maybe_free_model_hooks()
905
 
906
+ if not return_dict:
907
+ return (image,)
908
 
909
  return QwenImagePipelineOutput(images=image)
qwenimage/optimization.py CHANGED
@@ -68,6 +68,7 @@ def drain_module_parameters(module: torch.nn.Module):
68
 
69
 
70
  @ftimed
 
71
  def optimize_pipeline_(
72
  pipeline: Callable[P, Any],
73
  cache_compiled=True,
@@ -116,4 +117,3 @@ def optimize_pipeline_(
116
 
117
 
118
  aoti_apply(compiled_transformer, pipeline.transformer)
119
- return compiled_transformer
 
68
 
69
 
70
  @ftimed
71
+ @spaces.GPU(duration=1500)
72
  def optimize_pipeline_(
73
  pipeline: Callable[P, Any],
74
  cache_compiled=True,
 
117
 
118
 
119
  aoti_apply(compiled_transformer, pipeline.transformer)
 
scripts/plot_data.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
scripts/run_experiment.py CHANGED
@@ -1,19 +1,33 @@
1
 
 
2
 
3
-
4
  from qwenimage.experiment import ExperimentConfig
5
- from qwenimage.experiments.experiments_qwen import Qwen_AoT, QwenBaseExperiment, ExperimentRegistry
 
6
 
7
- # experiment = QwenBaseExperiment(ExperimentConfig(name="qwen-base"))
8
- # experiment.load()
9
- # experiment.optimize()
10
- # experiment.run()
11
- # experiment.report()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- experiment = Qwen_AoT(ExperimentConfig(name="qwen-aot"))
14
- experiment.load()
15
- experiment.optimize()
16
- experiment.run()
17
- experiment.report()
18
 
 
 
19
 
 
1
 
2
+ import argparse
3
 
4
+ from qwenimage.debug import clear_cuda_memory, print_gpu_memory
5
  from qwenimage.experiment import ExperimentConfig
6
+ from qwenimage.experiments.experiments_qwen import PipeInputs, Qwen_AoT, QwenBaseExperiment, ExperimentRegistry
7
+
8
 
9
+ def main():
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument("--name", type=str, required=True)
12
+ parser.add_argument("--iterations", type=int, default=100)
13
+ args = parser.parse_args()
14
+
15
+ name = args.name
16
+
17
+ pipe_inputs = PipeInputs()
18
+ experiment = ExperimentRegistry.get(name)(
19
+ config=ExperimentConfig(
20
+ name=name,
21
+ iterations=args.iterations,
22
+ ),
23
+ pipe_inputs=pipe_inputs,
24
+ )
25
+ experiment.load()
26
+ experiment.optimize()
27
+ experiment.run()
28
+ experiment.report()
29
 
 
 
 
 
 
30
 
31
+ if __name__ == "__main__":
32
+ main()
33
 
scripts/run_experiment_modal.py DELETED
@@ -1,21 +0,0 @@
1
-
2
- import modal
3
-
4
- from qwenimage.experiment import ExperimentConfig
5
- from qwenimage.experiments.experiments_qwen import QwenBaseExperiment
6
-
7
-
8
- app = modal.App("gradio-demo")
9
- app.image = (
10
- modal.Image.debian_slim(python_version="3.10")
11
- .apt_install("git", "ffmpeg", "libsm6", "libxext6")
12
- .pip_install_from_requirements(os.path.abspath("./requirements.txt"))
13
- .add_local_python_source("qwenimage")
14
- )
15
-
16
- experiment = QwenBaseExperiment(ExperimentConfig(name="qwen-base"))
17
-
18
- experiment.load()
19
- experiment.optimize()
20
- experiment.run()
21
- experiment.report()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
scripts/run_multi.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import subprocess
3
+ import sys
4
+
5
+ from qwenimage.experiments.experiments_qwen import ExperimentRegistry
6
+
7
+
8
+ def main():
9
+ parser = argparse.ArgumentParser()
10
+ parser.add_argument("--iterations", type=int, default=4)
11
+ args = parser.parse_args()
12
+
13
+ experiment_names = ExperimentRegistry.keys()
14
+ print(f"{len(experiment_names)}x {experiment_names}")
15
+
16
+ for name in experiment_names:
17
+ print(name)
18
+
19
+ cmd = [
20
+ sys.executable,
21
+ "scripts/run_experiment.py",
22
+ "--name", name,
23
+ "--iterations", str(args.iterations),
24
+ ]
25
+
26
+ result = subprocess.run(cmd, check=True, capture_output=False, text=True)
27
+ print(result)
28
+
29
+
30
+ if __name__ == "__main__":
31
+ main()
32
+
scripts/run_multi_experiments.py DELETED
@@ -1,87 +0,0 @@
1
-
2
- import matplotlib.pyplot as plt
3
- import pandas as pd
4
-
5
- from qwenimage.debug import clear_cuda_memory, print_gpu_memory
6
- from qwenimage.experiment import ExperimentConfig
7
- from qwenimage.experiments.experiments_qwen import ExperimentRegistry, PipeInputs
8
-
9
- experiment_names = ExperimentRegistry.keys()
10
- print(experiment_names)
11
-
12
- pipe_inputs = PipeInputs()
13
-
14
- # Collect results from all experiments
15
- all_results = []
16
-
17
- for name in experiment_names:
18
- print(f"Running {name}")
19
- experiment = ExperimentRegistry.get(name)(
20
- config=ExperimentConfig(
21
- name=name,
22
- iterations=10,
23
- ),
24
- pipe_inputs=pipe_inputs,
25
- )
26
- experiment.load()
27
- experiment.optimize()
28
- experiment.run()
29
- base_df, base_raw_data = experiment.report()
30
-
31
- # Add experiment name to the dataframe
32
- base_df['experiment'] = name
33
- all_results.append(base_df)
34
-
35
- experiment.cleanup()
36
- del experiment
37
-
38
- clear_cuda_memory()
39
-
40
- print_gpu_memory(clear_mem=None)
41
-
42
- # Combine all results
43
- combined_df = pd.concat(all_results, ignore_index=True)
44
-
45
- # Define desired names to plot
46
- desired_names = ["loop", "QwenBaseExperiment.run_once"]
47
-
48
- # Filter for desired names
49
- plot_data = combined_df[combined_df['name'].isin(desired_names)].copy()
50
-
51
- print(plot_data)
52
-
53
- # Sort by mean in descending order (rightmost = lowest mean)
54
- plot_data = plot_data.sort_values('mean', ascending=False)
55
-
56
- # Create bar plot
57
- fig, ax = plt.subplots(figsize=(12, 6))
58
-
59
- # Create x positions for bars
60
- x_pos = range(len(plot_data))
61
-
62
- # Plot bars with error bars
63
- bars = ax.bar(x_pos, plot_data['mean'], yerr=plot_data['std'],
64
- capsize=5, alpha=0.7, edgecolor='black')
65
-
66
- # Customize plot
67
- ax.set_xlabel('Method', fontsize=12, fontweight='bold')
68
- ax.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
69
- ax.set_title('Performance Comparison: Mean Execution Time with Standard Deviation',
70
- fontsize=14, fontweight='bold')
71
- ax.set_xticks(x_pos)
72
- ax.set_xticklabels([f"{row['experiment']}\n{row['name']}"
73
- for _, row in plot_data.iterrows()],
74
- rotation=45, ha='right')
75
- ax.grid(axis='y', alpha=0.3)
76
-
77
- # Add value labels on top of bars
78
- for i, (idx, row) in enumerate(plot_data.iterrows()):
79
- ax.text(i, row['mean'] + row['std'], f"{row['mean']:.3f}s",
80
- ha='center', va='bottom', fontsize=9)
81
-
82
- plt.tight_layout()
83
- plt.savefig('reports/performance_comparison.png', dpi=300, bbox_inches='tight')
84
- print("\nPerformance comparison plot saved to: reports/performance_comparison.png")
85
- plt.show()
86
-
87
-