Qwen-Image-Edit-Angles

Running on Zero

App Files Files Community

Elea Zhong committed 18 days ago

Commit

0365768

1 Parent(s): f9abc90

image context training

Browse files

Files changed (9) hide show

configs/base.yaml +1 -0
configs/style/lora-im2im.yaml +18 -0
configs/style/{lora-1.yaml → lora-naive.yaml} +6 -1
qwenimage/datamodels.py +13 -1
qwenimage/foundation.py +51 -39
qwenimage/{datasets.py → sources.py} +53 -0
qwenimage/task.py +1 -1
qwenimage/training.py +41 -6
scripts/train.ipynb +81 -12

configs/base.yaml CHANGED Viewed

@@ -15,6 +15,7 @@ num_workers: 4
 resume_from_checkpoint: null
 log_model_steps: 100
 preprocessing_epoch_len: 64
 # Logging

 resume_from_checkpoint: null
 log_model_steps: 100
 preprocessing_epoch_len: 64
+preprocessing_epoch_repetitions: 1
 # Logging

configs/style/lora-im2im.yaml ADDED Viewed

	@@ -0,0 +1,18 @@

+wandb_run_name: "lora-im2im"
+output_dir: "/data/checkpoints/lora-im2im"
+learning_rate: 1e-4
+num_train_epochs: 1
+max_train_steps: 1000
+preprocessing_epoch_len: 33
+preprocessing_epoch_repetitions: 31
+num_validation_images: 2
+num_sample_images: 2
+source_type: "im2im"
+style_title: "Simpsons"
+csv_path: "/data/chatgpt-style-transfer-data/output/results.csv"
+base_dir: "/data/chatgpt-style-transfer-data"
+train_range: [2, 35]
+test_range: [0, 2]
+val_with: test

configs/style/{lora-1.yaml → lora-naive.yaml} RENAMED Viewed

@@ -1,4 +1,9 @@
 wandb_run_name: "lora-naive"
 output_dir: "/data/checkpoints/lora-naive"
-learning_rate: 4e-4

 wandb_run_name: "lora-naive"
 output_dir: "/data/checkpoints/lora-naive"
+learning_rate: 4e-4
+source_type: "naive"
+data_dir: "/data/styles-finetune-data-artistic/tarot"
+prompt: "<0001>"
+ref_dir: "/data/image"

qwenimage/datamodels.py CHANGED Viewed

@@ -43,7 +43,19 @@ class QwenConfig(ExperimentTrainerParameters):
     static_mu: float | None = None
     loss_weight_dist: str | None = None # "scaled_clipped_gaussian", "logit-normal"
-    vae_image_size: int = 1024 * 1024
     offload_text_encoder: bool = True
     quantize_text_encoder: bool = False
     quantize_transformer: bool = False

     static_mu: float | None = None
     loss_weight_dist: str | None = None # "scaled_clipped_gaussian", "logit-normal"
+    vae_image_size: int = 512 * 512
     offload_text_encoder: bool = True
     quantize_text_encoder: bool = False
     quantize_transformer: bool = False
+    source_type: str = "im2im"
+    style_title: str|None = None
+    base_dir: str|None = None
+    csv_path: str|None = None
+    data_dir: str|None = None
+    ref_dir: str|None = None
+    prompt: str|None = None
+    train_range: tuple[int|float,int|float]|None=None
+    test_range: tuple[int|float,int|float]|None=None
+    val_with: str = "train"

qwenimage/foundation.py CHANGED Viewed

@@ -8,14 +8,14 @@ from diffusers.pipelines.qwenimage.pipeline_qwenimage import QwenImagePipeline
 import torch
 from safetensors.torch import load_file, save_model
 import torch.nn.functional as F
 from einops import rearrange
 from qwenimage.datamodels import QwenConfig, QwenInputs
 from qwenimage.debug import ctimed, ftimed, print_gpu_memory, texam
 from qwenimage.experiments.quantize_text_encoder_experiments import quantize_text_encoder_int4wo_linear
 from qwenimage.experiments.quantize_experiments import quantize_transformer_fp8darow_nolast
-from qwenimage.models.encode_prompt import encode_prompt
-from qwenimage.models.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
 from qwenimage.optimization import simple_quantize_model
 from qwenimage.sampling import TimestepDistUtils
@@ -90,7 +90,6 @@ class QwenImageFoundation(WandModel):
             static_mu=self.config.static_mu,
             loss_weight_dist=self.config.loss_weight_dist,
         )
-        self.static_prompt_embeds = None
         if self.config.quantize_text_encoder:
             quantize_text_encoder_int4wo_linear(self.text_encoder)
@@ -131,8 +130,14 @@ class QwenImageFoundation(WandModel):
     def pil_to_latents(self, images):
         image = self.pipe.image_processor.preprocess(images)
-        print("pil_to_latents, image")
         texam(image)
         image = image.unsqueeze(2) # N, C, F=1, H, W
         image = image.to(device=self.device, dtype=self.dtype)
         latents = self.pipe.vae.encode(image).latent_dist.mode() # argmax
@@ -149,7 +154,7 @@ class QwenImageFoundation(WandModel):
         )
         latents = (latents - latents_mean) / latents_std
         latents = latents.squeeze(2)
-        print("pil_to_latents, latents")
         texam(latents)
         return latents.to(dtype=self.dtype)
@@ -185,29 +190,19 @@ class QwenImageFoundation(WandModel):
         latents = rearrange(packed, "b (h w) (c ph pw) -> b c (h ph) (w pw)", ph=2, pw=2, h=h, w=w)
         return latents
-    def set_static_prompt(self, prompt:str):
-        self.text_encoder.to(device=self.device)
-        if self.text_encoder_device != "cuda":
-            self.text_encoder_device = "cuda"
-        with torch.no_grad():
-            prompt_embeds, prompt_embeds_mask = encode_prompt(
-                self.text_encoder,
-                self.pipe.tokenizer,
-                prompt,
-                device=self.device,
-                dtype=self.dtype,
-                max_sequence_length = self.config.train_max_sequence_length,
-            )
-        prompt_embeds = prompt_embeds.cpu().clone().detach()
-        prompt_embeds_mask = prompt_embeds_mask.cpu().clone().detach()
-        self.static_prompt_embeds = (prompt_embeds, prompt_embeds_mask)
     @ftimed
     def preprocess_batch(self, batch):
         prompts = batch["text"]
-        if self.static_prompt_embeds is not None:
-            prompt_embeds, prompt_embeds_mask = self.static_prompt_embeds
         with ctimed("text_encoder.cuda()"):
             self.text_encoder.to(device=self.device)
@@ -215,26 +210,19 @@ class QwenImageFoundation(WandModel):
                 self.text_encoder_device = "cuda"
         with torch.no_grad():
-            prompt_embeds, prompt_embeds_mask = encode_prompt(
-                self.text_encoder,
-                self.pipe.tokenizer,
                 prompts,
-                device=self.device,
-                dtype=self.dtype,
                 max_sequence_length = self.config.train_max_sequence_length,
             )
-            # prompt_embeds, prompt_embeds_mask = foundation.pipe.encode_prompt(
-            #         inps[i]["prompt"],
-            #         _transforms(inps[i]["image"][0]).mul(255),
-            #         device="cuda",
-            #         # dtype=foundation.dtype,
-            #         max_sequence_length = foundation.config.train_max_sequence_length,
-            #     )
         prompt_embeds = prompt_embeds.cpu().clone().detach()
         prompt_embeds_mask = prompt_embeds_mask.cpu().clone().detach()
         batch["prompt_embeds"] = (prompt_embeds, prompt_embeds_mask)
         return batch
@@ -253,25 +241,42 @@ class QwenImageFoundation(WandModel):
         prompt_embeds = prompt_embeds.to(device=self.device)
         prompt_embeds_mask = prompt_embeds_mask.to(device=self.device)
-        images = batch["image"]
         x_0 = self.pil_to_latents(images).to(device=self.device, dtype=self.dtype)
         x_1 = torch.randn_like(x_0).to(device=self.device, dtype=self.dtype)
         seq_len = self.timestep_dist_utils.get_seq_len(x_0)
         batch_size = x_0.shape[0]
         t = self.timestep_dist_utils.get_train_t([batch_size], seq_len=seq_len).to(device=self.device, dtype=self.dtype)
         x_t = (1.0 - t) * x_0 + t * x_1
         x_t_1d = self.pack_latents(x_t)
         l_height, l_width = x_0.shape[-2:]
         img_shapes = [
-            [(1, l_height // 2, l_width // 2), ]
         ] * batch_size
         txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()
         image_rotary_emb = self.transformer.pos_embed(img_shapes, txt_seq_lens, device=x_0.device)
         v_pred_1d = self.transformer(
-            hidden_states=x_t_1d,
             encoder_hidden_states=prompt_embeds,
             encoder_hidden_states_mask=prompt_embeds_mask,
             timestep=t,
@@ -279,6 +284,8 @@ class QwenImageFoundation(WandModel):
             return_dict=False,
         )[0]
         v_pred_2d = self.unpack_latents(v_pred_1d, h=l_height//2, w=l_width//2)
         v_gt_2d = x_1 - x_0
@@ -298,6 +305,11 @@ class QwenImageFoundation(WandModel):
             self.text_encoder.to(device=self.device)
             if self.text_encoder_device != "cuda":
                 self.text_encoder_device = "cuda"
         return self.pipe(**inputs.model_dump()).images

 import torch
 from safetensors.torch import load_file, save_model
 import torch.nn.functional as F
+import torchvision.transforms.v2.functional as TF
 from einops import rearrange
 from qwenimage.datamodels import QwenConfig, QwenInputs
 from qwenimage.debug import ctimed, ftimed, print_gpu_memory, texam
 from qwenimage.experiments.quantize_text_encoder_experiments import quantize_text_encoder_int4wo_linear
 from qwenimage.experiments.quantize_experiments import quantize_transformer_fp8darow_nolast
+from qwenimage.models.pipeline_qwenimage_edit_plus import CONDITION_IMAGE_SIZE, QwenImageEditPlusPipeline, calculate_dimensions
 from qwenimage.models.transformer_qwenimage import QwenImageTransformer2DModel
 from qwenimage.optimization import simple_quantize_model
 from qwenimage.sampling import TimestepDistUtils
             static_mu=self.config.static_mu,
             loss_weight_dist=self.config.loss_weight_dist,
         )
         if self.config.quantize_text_encoder:
             quantize_text_encoder_int4wo_linear(self.text_encoder)
     def pil_to_latents(self, images):
         image = self.pipe.image_processor.preprocess(images)
+        h,w = image.shape[-2:]
+        h_r, w_r = calculate_dimensions(self.config.vae_image_size, h/w)
+        image = TF.resize(image, (h_r, w_r))
+        print("pil_to_latents.image")
         texam(image)
         image = image.unsqueeze(2) # N, C, F=1, H, W
         image = image.to(device=self.device, dtype=self.dtype)
         latents = self.pipe.vae.encode(image).latent_dist.mode() # argmax
         )
         latents = (latents - latents_mean) / latents_std
         latents = latents.squeeze(2)
+        print("pil_to_latents.latents")
         texam(latents)
         return latents.to(dtype=self.dtype)
         latents = rearrange(packed, "b (h w) (c ph pw) -> b c (h ph) (w pw)", ph=2, pw=2, h=h, w=w)
         return latents
     @ftimed
     def preprocess_batch(self, batch):
         prompts = batch["text"]
+        references = batch["reference"]
+        h,w = references.shape[-2:]
+        h_r, w_r = calculate_dimensions(CONDITION_IMAGE_SIZE, h/w)
+        references = TF.resize(references, (h_r, w_r))
+        print("preprocess_batch.references")
+        texam(references)
         with ctimed("text_encoder.cuda()"):
             self.text_encoder.to(device=self.device)
                 self.text_encoder_device = "cuda"
         with torch.no_grad():
+            prompt_embeds, prompt_embeds_mask = self.pipe.encode_prompt(
                 prompts,
+                references.mul(255), # scaled to RGB
+                device="cuda",
                 max_sequence_length = self.config.train_max_sequence_length,
             )
         prompt_embeds = prompt_embeds.cpu().clone().detach()
         prompt_embeds_mask = prompt_embeds_mask.cpu().clone().detach()
         batch["prompt_embeds"] = (prompt_embeds, prompt_embeds_mask)
+        batch["reference"] = batch["reference"].cpu()
+        batch["image"] = batch["image"].cpu()
         return batch
         prompt_embeds = prompt_embeds.to(device=self.device)
         prompt_embeds_mask = prompt_embeds_mask.to(device=self.device)
+        images = batch["image"].to(device=self.device, dtype=self.dtype)
         x_0 = self.pil_to_latents(images).to(device=self.device, dtype=self.dtype)
         x_1 = torch.randn_like(x_0).to(device=self.device, dtype=self.dtype)
         seq_len = self.timestep_dist_utils.get_seq_len(x_0)
         batch_size = x_0.shape[0]
         t = self.timestep_dist_utils.get_train_t([batch_size], seq_len=seq_len).to(device=self.device, dtype=self.dtype)
         x_t = (1.0 - t) * x_0 + t * x_1
         x_t_1d = self.pack_latents(x_t)
+        references = batch["reference"].to(device=self.device, dtype=self.dtype)
+        print("references")
+        texam(references)
+        assert references.shape[0] == 1
+        refs = self.pil_to_latents(references).to(device=self.device, dtype=self.dtype)
+        refs_1d = self.pack_latents(refs)
+        print("refs refs_1d")
+        texam(refs)
+        texam(refs_1d)
+        inp_1d = torch.cat([x_t_1d, refs_1d], dim=1)
+        print("inp_1d")
+        texam(inp_1d)
         l_height, l_width = x_0.shape[-2:]
+        ref_height, ref_width = refs.shape[-2:]
         img_shapes = [
+            [
+                (1, l_height // 2, l_width // 2),
+                (1, ref_height // 2, ref_width // 2),
+            ]
         ] * batch_size
         txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()
         image_rotary_emb = self.transformer.pos_embed(img_shapes, txt_seq_lens, device=x_0.device)
         v_pred_1d = self.transformer(
+            hidden_states=inp_1d,
             encoder_hidden_states=prompt_embeds,
             encoder_hidden_states_mask=prompt_embeds_mask,
             timestep=t,
             return_dict=False,
         )[0]
+        v_pred_1d = v_pred_1d[:, : x_t_1d.size(1)]
         v_pred_2d = self.unpack_latents(v_pred_1d, h=l_height//2, w=l_width//2)
         v_gt_2d = x_1 - x_0
             self.text_encoder.to(device=self.device)
             if self.text_encoder_device != "cuda":
                 self.text_encoder_device = "cuda"
+        image = inputs.image[0]
+        w,h = image.size
+        h_r, w_r = calculate_dimensions(self.config.vae_image_size, h/w)
+        image = TF.resize(image, (h_r, w_r))
+        inputs.image = [image]
         return self.pipe(**inputs.model_dump()).images

qwenimage/{datasets.py → sources.py} RENAMED Viewed

@@ -1,5 +1,6 @@
 from pathlib import Path
 import random
@@ -54,3 +55,55 @@ class StyleSourceWithRandomRef(Source):
         rand_ref = random.choice(self.ref_images)
         ref_pil = Image.open(rand_ref).convert("RGB")
         return im_pil, self.prompt, ref_pil

+import csv
 from pathlib import Path
 import random
         rand_ref = random.choice(self.ref_images)
         ref_pil = Image.open(rand_ref).convert("RGB")
         return im_pil, self.prompt, ref_pil
+class StyleImagetoImageSource(Source):
+    _data_types = [
+        SourceDataType(name="text", type=str),
+        SourceDataType(name="image", type=Image.Image),
+        SourceDataType(name="reference", type=Image.Image),
+    ]
+    def __init__(self, csv_path, base_dir, style_title=None, data_range:tuple[int|float,int|float]|None=None):
+        self.csv_path = Path(csv_path)
+        self.base_dir = Path(base_dir)
+        self.style_title = style_title
+        self.data = []
+        with open(self.csv_path, 'r', encoding='utf-8') as f:
+            reader = csv.DictReader(f)
+            for row in reader:
+                if self.style_title is not None and row['style_title'] != self.style_title:
+                    continue
+                input_image = self.base_dir / row['input_image']
+                output_image = self.base_dir / row['output_image_path']
+                self.data.append({
+                    'input_image': input_image,
+                    'output_image': output_image,
+                    'style_title': row['style_title'],
+                    'prompt': row['prompt']
+                })
+        if data_range is not None:
+            left, right = data_range
+            if (isinstance(left, float) or isinstance(right, float)) and (left<1 and right<1):
+                left = left * len(self.data)
+                right = right * len(self.data)
+            remain_data = []
+            for i, d in enumerate(self.data):
+                if left <= i and i < right:
+                    remain_data.append(d)
+            self.data = remain_data
+        print(f"{self.__class__} of len{len(self)}")
+    def __len__(self):
+        return len(self.data)
+    def __getitem__(self, idx):
+        item = self.data[idx]
+        prompt = item["prompt"]
+        input_pil = Image.open(item['input_image']).convert("RGB")
+        output_pil = Image.open(item['output_image']).convert("RGB")
+        return prompt, output_pil, input_pil

qwenimage/task.py CHANGED Viewed

@@ -11,7 +11,7 @@ image_transforms = T.Compose([
     RemoveAlphaTransform(bg_color_rgb=(34, 34, 34)),
     T.ToImage(),
     T.RGB(),
-    RandomDownsize(sizes=(384, 512, 768)),
     T.ToDtype(torch.float, scale=True),
 ])

     RemoveAlphaTransform(bg_color_rgb=(34, 34, 34)),
     T.ToImage(),
     T.RGB(),
+    # RandomDownsize(sizes=(384, 512, 768)),
     T.ToDtype(torch.float, scale=True),
 ])

qwenimage/training.py CHANGED Viewed

@@ -17,7 +17,7 @@ from wandml.trainers.experiment_trainer import ExperimentTrainer
 from qwenimage.finetuner import QwenLoraFinetuner
-from qwenimage.datasets import StyleSourceWithRandomRef
 from qwenimage.task import TextToImageWithRefTask
 from qwenimage.datamodels import QwenConfig
 from qwenimage.foundation import QwenImageFoundation
@@ -50,11 +50,32 @@ def run_training(config_path: Path | str, update_config_paths: list[Path] | None
     )
     # Data
-    src = StyleSourceWithRandomRef("/data/styles-finetune-data-artistic/tarot", "<0001>", "/data/image", set_len=1000)
-    task = TextToImageWithRefTask()
     dp = WandDataPipe()
-    dp.add_source(src)
-    dp.set_task(task)
     # Model
@@ -63,7 +84,21 @@ def run_training(config_path: Path | str, update_config_paths: list[Path] | None
     finetuner.load(None)
-    trainer = ExperimentTrainer(foundation,dp,config)
     trainer.train()

 from qwenimage.finetuner import QwenLoraFinetuner
+from qwenimage.sources import StyleSourceWithRandomRef, StyleImagetoImageSource
 from qwenimage.task import TextToImageWithRefTask
 from qwenimage.datamodels import QwenConfig
 from qwenimage.foundation import QwenImageFoundation
     )
     # Data
     dp = WandDataPipe()
+    dp.set_task(TextToImageWithRefTask())
+    dp_test = WandDataPipe()
+    dp_test.set_task(TextToImageWithRefTask())
+    if config.source_type == "naive":
+        src = StyleSourceWithRandomRef(
+            config.data_dir, config.prompt, config.ref_dir, set_len=config.max_train_steps
+        )
+        dp.add_source(src)
+    elif config.source_type == "im2im":
+        src = StyleImagetoImageSource(
+            csv_path=config.csv_path,
+            base_dir=config.base_dir,
+            style_title=config.style_title,
+            data_range=config.train_range,
+        )
+        dp.add_source(src)
+        src_test = StyleImagetoImageSource(
+            csv_path=config.csv_path,
+            base_dir=config.base_dir,
+            style_title=config.style_title,
+            data_range=config.test_range,
+        )
+        dp_test.add_source(src_test)
+    else:
+        raise ValueError()
     # Model
     finetuner.load(None)
+    if len(dp_test) == 0:
+        dp_test = None
+    if config.val_with == "train":
+        dp_val = dp
+    elif config.val_with == "test":
+        dp_val = dp_test
+    else:
+        raise ValueError()
+    trainer = ExperimentTrainer(
+        model=foundation,
+        datapipe=dp,
+        args=config,
+        validation_datapipe=dp_val,
+        test_datapipe=dp_test,
+    )
     trainer.train()

scripts/train.ipynb CHANGED Viewed

@@ -30,16 +30,16 @@
      "text": [
       "/usr/lib/python3/dist-packages/sklearn/utils/fixes.py:25: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
       "  from pkg_resources import parse_version  # type: ignore\n",
-      "2025-11-22 18:13:10.673389: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
-      "2025-11-22 18:13:10.687858: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
       "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
-      "E0000 00:00:1763835190.705243 2236633 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
-      "E0000 00:00:1763835190.710795 2236633 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
-      "W0000 00:00:1763835190.724588 2236633 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-      "W0000 00:00:1763835190.724603 2236633 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-      "W0000 00:00:1763835190.724605 2236633 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-      "W0000 00:00:1763835190.724607 2236633 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
-      "2025-11-22 18:13:10.729261: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
       "To enable the following instructions: AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
      ]
     },
@@ -129,6 +129,20 @@
       "and open an issue at: https://github.com/bitsandbytes-foundation/bitsandbytes/issues\n",
       "\n"
      ]
     }
    ],
    "source": [
@@ -137,15 +151,70 @@
     "from pathlib import Path\n",
     "import argparse\n",
     "\n",
-    "from ruamel.yaml import YAML\n",
     "import diffusers\n",
     "\n",
     "\n",
     "from wandml.trainers.experiment_trainer import ExperimentTrainer\n",
     "from wandml import WandDataPipe\n",
     "import wandml\n",
     "\n",
-    "from qwenimage.finetuner import QwenLoraFinetuner\n"
    ]
   },
   {
@@ -521,7 +590,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.6"
   }
  },
  "nbformat": 4,

      "text": [
       "/usr/lib/python3/dist-packages/sklearn/utils/fixes.py:25: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.\n",
       "  from pkg_resources import parse_version  # type: ignore\n",
+      "2025-11-23 10:48:20.190181: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2025-11-23 10:48:20.204255: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
       "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+      "E0000 00:00:1763894900.221429 2465541 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+      "E0000 00:00:1763894900.227066 2465541 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+      "W0000 00:00:1763894900.240375 2465541 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+      "W0000 00:00:1763894900.240390 2465541 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+      "W0000 00:00:1763894900.240392 2465541 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+      "W0000 00:00:1763894900.240394 2465541 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
+      "2025-11-23 10:48:20.244577: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
       "To enable the following instructions: AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
      ]
     },
       "and open an issue at: https://github.com/bitsandbytes-foundation/bitsandbytes/issues\n",
       "\n"
      ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "f70e31b9ba79496a921f0e7d0cddfed4",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
     }
    ],
    "source": [
     "from pathlib import Path\n",
     "import argparse\n",
     "\n",
+    "import yaml\n",
     "import diffusers\n",
     "\n",
     "\n",
     "from wandml.trainers.experiment_trainer import ExperimentTrainer\n",
     "from wandml import WandDataPipe\n",
     "import wandml\n",
+    "from wandml import WandAuth\n",
+    "from wandml import utils as wandml_utils\n",
+    "from wandml.trainers.datamodels import ExperimentTrainerParameters\n",
+    "from wandml.trainers.experiment_trainer import ExperimentTrainer\n",
+    "\n",
     "\n",
+    "from qwenimage.finetuner import QwenLoraFinetuner\n",
+    "from qwenimage.sources import StyleSourceWithRandomRef, StyleImagetoImageSource\n",
+    "from qwenimage.task import TextToImageWithRefTask\n",
+    "from qwenimage.datamodels import QwenConfig\n",
+    "from qwenimage.foundation import QwenImageFoundation\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "18bf116a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'qwenimage.sources.StyleImagetoImageSource'> of len2\n"
+     ]
+    }
+   ],
+   "source": [
+    "src = StyleImagetoImageSource(\n",
+    "    csv_path=\"/data/chatgpt-style-transfer-data/output/results.csv\",\n",
+    "    base_dir=\"/data/chatgpt-style-transfer-data\",\n",
+    "    style_title=\"Simpsons\",\n",
+    "    data_range=[2, 35],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'qwenimage.sources.StyleImagetoImageSource'> of len33\n"
+     ]
+    }
+   ],
+   "source": [
+    "src = StyleImagetoImageSource(\n",
+    "    csv_path=\"/data/chatgpt-style-transfer-data/output/results.csv\",\n",
+    "    base_dir=\"/data/chatgpt-style-transfer-data\",\n",
+    "    style_title=\"Simpsons\",\n",
+    "    data_range=[0, 2],\n",
+    ")"
    ]
   },
   {
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,