Commit f96f677
Parent(s): test

Files changed:
- .gitattributes +36 -0
- README.md +13 -0
- __pycache__/dreamfuse_inference.cpython-310.pyc +0 -0
- app.py +491 -0
- dreamfuse/.DS_Store +0 -0
- dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc +0 -0
- dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc +0 -0
- dreamfuse/models/dreamfuse_flux/flux_processor.py +269 -0
- dreamfuse/models/dreamfuse_flux/transformer.py +866 -0
- dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc +0 -0
- dreamfuse/trains/utils/inference_utils.py +386 -0
- dreamfuse_inference.py +642 -0
- examples/9_01.png +3 -0
- examples/9_02.png +3 -0
- output_images/no_bg_image.png +3 -0
- requirements.txt +37 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
---
title: DreamFuse
emoji: 📚
colorFrom: indigo
colorTo: blue
sdk: gradio
sdk_version: 5.24.0
app_file: app.py
pinned: false
license: mit
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/dreamfuse_inference.cpython-310.pyc ADDED
Binary file (14.1 kB).
app.py ADDED
@@ -0,0 +1,491 @@
import gradio as gr
import spaces
from PIL import Image, ImageDraw, ImageOps
import base64, json
from io import BytesIO
import torch.nn.functional as F
import json
from typing import List
from dataclasses import dataclass, field
from dreamfuse_inference import DreamFuseInference, InferenceConfig
import numpy as np
import os
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
import torch
import subprocess

subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)
generated_images = []


RMBG_model = AutoModelForImageSegmentation.from_pretrained('briaai/RMBG-2.0', trust_remote_code=True)
RMBG_model = RMBG_model.to("cuda")
transform = transforms.Compose([
    transforms.Resize((1024, 1024)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])


@spaces.GPU
def remove_bg(image):
    im = image.convert("RGB")
    input_tensor = transform(im).unsqueeze(0).to("cuda")
    with torch.no_grad():
        preds = RMBG_model(input_tensor)[-1].sigmoid().cpu()[0].squeeze()
    mask = transforms.ToPILImage()(preds).resize(im.size)
    return mask


class DreamblendGUI:
    def __init__(self):
        self.examples = [
            ["./examples/9_02.png",
             "./examples/9_01.png"],
        ]
        self.examples = [[Image.open(x) for x in example] for example in self.examples]
        self.css_style = self._get_css_style()
        self.js_script = self._get_js_script()

    def _get_css_style(self):
        return """
        body {
            background: transparent;
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            color: #fff;
        }
        .gradio-container {
            max-width: 1200px;
            margin: auto;
            background: transparent;
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0px 2px 8px rgba(255,255,255,0.1);
        }
        h1, h2 {
            text-align: center;
            color: #fff;
        }
        #canvas_preview {
            border: 2px dashed rgba(255,255,255,0.5);
            padding: 10px;
            background: transparent;
            border-radius: 8px;
        }
        .gr-button {
            background-color: #007bff;
            border: none;
            color: #fff;
            padding: 10px 20px;
            border-radius: 5px;
            font-size: 16px;
            cursor: pointer;
        }
        .gr-button:hover {
            background-color: #0056b3;
        }
        #small-examples {
            max-width: 200px !important;
            width: 200px !important;
            float: left;
            margin-right: 20px;
        }
        """

    def _get_js_script(self):
        return r"""
        async () => {
            window.updateTransformation = function() {
                const img = document.getElementById('draggable-img');
                const container = document.getElementById('canvas-container');
                if (!img || !container) return;
                const left = parseFloat(img.style.left) || 0;
                const top = parseFloat(img.style.top) || 0;

                const canvasSize = 400;
                const data_original_width = parseFloat(img.getAttribute('data-original-width'));
                const data_original_height = parseFloat(img.getAttribute('data-original-height'));
                const bgWidth = parseFloat(container.dataset.bgWidth);
                const bgHeight = parseFloat(container.dataset.bgHeight);
                const scale_ratio = img.clientWidth / data_original_width;

                const transformation = {
                    drag_left: left,
                    drag_top: top,
                    drag_width: img.clientWidth,
                    drag_height: img.clientHeight,
                    data_original_width: data_original_width,
                    data_original_height: data_original_height,
                    scale_ratio: scale_ratio
                };

                const transInput = document.querySelector("#transformation_info textarea");
                if(transInput){
                    const newValue = JSON.stringify(transformation);
                    const nativeSetter = Object.getOwnPropertyDescriptor(window.HTMLTextAreaElement.prototype, 'value').set;
                    nativeSetter.call(transInput, newValue);
                    transInput.dispatchEvent(new Event('input', { bubbles: true }));
                    console.log("Transformation info updated: ", newValue);
                } else {
                    console.log("Could not find the transformation_info textarea element");
                }
            };

            globalThis.initializeDrag = () => {
                console.log("Initializing drag and scale support...");
                const observer = new MutationObserver(() => {
                    const img = document.getElementById('draggable-img');
                    const container = document.getElementById('canvas-container');
                    const slider = document.getElementById('scale-slider');
                    if (img && container && slider) {
                        observer.disconnect();
                        console.log("Binding drag and scale events...");
                        img.ondragstart = (e) => { e.preventDefault(); return false; };
                        let offsetX = 0, offsetY = 0;
                        let isDragging = false;
                        let scaleAnchor = null;

                        img.addEventListener('mousedown', (e) => {
                            isDragging = true;
                            img.style.cursor = 'grabbing';
                            const imgRect = img.getBoundingClientRect();
                            offsetX = e.clientX - imgRect.left;
                            offsetY = e.clientY - imgRect.top;
                            img.style.transform = "none";
                            img.style.left = img.offsetLeft + "px";
                            img.style.top = img.offsetTop + "px";
                            console.log("mousedown: left=", img.style.left, "top=", img.style.top);
                        });
                        document.addEventListener('mousemove', (e) => {
                            if (!isDragging) return;
                            e.preventDefault();

                            const containerRect = container.getBoundingClientRect();
                            // Compute the dragged position (relative to the container)
                            let left = e.clientX - containerRect.left - offsetX;
                            let top = e.clientY - containerRect.top - offsetY;

                            // Allowed drag range:
                            // horizontally the image may extend past the left edge down to -img.clientWidth * (7/8)
                            // and past the right edge up to containerRect.width - img.clientWidth * (1/8)
                            const minLeft = -img.clientWidth * (7/8);
                            const maxLeft = containerRect.width - img.clientWidth * (1/8);

                            // Vertical range:
                            // minimum is -img.clientHeight * (7/8)
                            // maximum is containerRect.height - img.clientHeight * (1/8)
                            const minTop = -img.clientHeight * (7/8);
                            const maxTop = containerRect.height - img.clientHeight * (1/8);

                            // Clamp to the allowed range
                            if (left < minLeft) left = minLeft;
                            if (left > maxLeft) left = maxLeft;

                            if (top < minTop) top = minTop;
                            if (top > maxTop) top = maxTop;

                            img.style.left = left + "px";
                            img.style.top = top + "px";
                        });

                        window.addEventListener('mouseup', (e) => {
                            if (isDragging) {
                                isDragging = false;
                                img.style.cursor = 'grab';
                                const containerRect = container.getBoundingClientRect();
                                const bgWidth = parseFloat(container.dataset.bgWidth);
                                const bgHeight = parseFloat(container.dataset.bgHeight);
                                const offsetLeft = (containerRect.width - bgWidth) / 2;
                                const offsetTop = (containerRect.height - bgHeight) / 2;
                                const absoluteLeft = parseFloat(img.style.left);
                                const absoluteTop = parseFloat(img.style.top);
                                const relativeX = absoluteLeft - offsetLeft;
                                const relativeY = absoluteTop - offsetTop;
                                document.getElementById("coordinate").textContent =
                                    `Foreground position: (x=${relativeX.toFixed(2)}, y=${relativeY.toFixed(2)})`;
                                updateTransformation();
                            }
                            scaleAnchor = null;
                        });

                        slider.addEventListener('mousedown', (e) => {
                            const containerRect = container.getBoundingClientRect();
                            const imgRect = img.getBoundingClientRect();
                            scaleAnchor = {
                                x: imgRect.left + imgRect.width/2 - containerRect.left,
                                y: imgRect.top + imgRect.height/2 - containerRect.top
                            };
                            console.log("Slider mousedown, captured scaleAnchor: ", scaleAnchor);
                        });

                        slider.addEventListener('input', (e) => {
                            const scale = parseFloat(e.target.value);
                            const originalWidth = parseFloat(img.getAttribute('data-original-width'));
                            const originalHeight = parseFloat(img.getAttribute('data-original-height'));
                            const newWidth = originalWidth * scale;
                            const newHeight = originalHeight * scale;
                            const containerRect = container.getBoundingClientRect();
                            let centerX, centerY;
                            if (scaleAnchor) {
                                centerX = scaleAnchor.x;
                                centerY = scaleAnchor.y;
                            } else {
                                const imgRect = img.getBoundingClientRect();
                                centerX = imgRect.left + imgRect.width/2 - containerRect.left;
                                centerY = imgRect.top + imgRect.height/2 - containerRect.top;
                            }
                            const newLeft = centerX - newWidth/2;
                            const newTop = centerY - newHeight/2;
                            img.style.width = newWidth + "px";
                            img.style.height = newHeight + "px";
                            img.style.left = newLeft + "px";
                            img.style.top = newTop + "px";
                            console.log("slider: scale=", scale, "newWidth=", newWidth, "newHeight=", newHeight);
                            updateTransformation();
                        });

                        slider.addEventListener('mouseup', (e) => {
                            scaleAnchor = null;
                        });
                    }
                });
                observer.observe(document.body, { childList: true, subtree: true });
            };
        }
        """

    def get_next_sequence(self, folder_path):
        # List all file names in the folder
        filenames = os.listdir(folder_path)
        # Extract the sequence-number part of each file name (assumed to be the leading digits)
        sequences = [int(name.split('_')[0]) for name in filenames if name.split('_')[0].isdigit()]
        # Find the largest sequence number
        max_sequence = max(sequences, default=-1)
        # Return the next sequence number, formatted as three digits (e.g. 002)
        return f"{max_sequence + 1:03d}"

    def pil_to_base64(self, img):
        """Convert a PIL Image to a base64 string; PNG keeps the alpha channel."""
        if img is None:
            return ""
        if img.mode != "RGBA":
            img = img.convert("RGBA")
        buffered = BytesIO()
        img.save(buffered, format="PNG", optimize=True)
        img_bytes = buffered.getvalue()
        base64_str = base64.b64encode(img_bytes).decode()
        return f"data:image/png;base64,{base64_str}"

    def resize_background_image(self, img, max_size=400):
        """Proportionally resize the background so its longest side is max_size (400)."""
        if img is None:
            return None
        w, h = img.size
        if w > max_size or h > max_size:
            ratio = min(max_size / w, max_size / h)
            new_w, new_h = int(w * ratio), int(h * ratio)
            img = img.resize((new_w, new_h), Image.LANCZOS)
        return img

    def resize_draggable_image(self, img, max_size=400):
        """Proportionally resize the foreground so its longest side does not exceed max_size (400)."""
        if img is None:
            return None
        w, h = img.size
        if w > max_size or h > max_size:
            ratio = min(max_size / w, max_size / h)
            new_w, new_h = int(w * ratio), int(h * ratio)
            img = img.resize((new_w, new_h), Image.LANCZOS)
        return img

    def generate_html(self, background_img_b64, bg_width, bg_height, draggable_img_b64, draggable_width, draggable_height, canvas_size=400):
        """Generate the preview HTML page."""
        html_code = f"""
        <html>
        <head>
        <style>
            body {{
                margin: 0;
                padding: 0;
                text-align: center;
                font-family: sans-serif;
                background: transparent;
                color: #fff;
            }}
            h2 {{
                margin-top: 1rem;
            }}
            #scale-control {{
                margin: 1rem auto;
                width: 400px;
                text-align: left;
            }}
            #scale-control label {{
                font-size: 1rem;
                margin-right: 0.5rem;
            }}
            #canvas-container {{
                position: relative;
                width: {canvas_size}px;
                height: {canvas_size}px;
                margin: 0 auto;
                border: 1px dashed rgba(255,255,255,0.5);
                overflow: hidden;
                background-image: url('{background_img_b64}');
                background-repeat: no-repeat;
                background-position: center;
                background-size: contain;
                border-radius: 8px;
            }}
            #draggable-img {{
                position: absolute;
                cursor: grab;
                left: 50%;
                top: 50%;
                transform: translate(-50%, -50%);
                background-color: transparent;
            }}
            #coordinate {{
                color: #fff;
                margin-top: 1rem;
                font-weight: bold;
            }}
        </style>
        </head>
        <body>
            <h2>Drag the foreground image (scaling supported)</h2>
            <div id="scale-control">
                <label for="scale-slider">Foreground scale:</label>
                <input type="range" id="scale-slider" min="0.1" max="2" step="0.01" value="1">
            </div>
            <div id="canvas-container" data-bg-width="{bg_width}" data-bg-height="{bg_height}">
                <img id="draggable-img"
                     src="{draggable_img_b64}"
                     alt="Draggable Image"
                     draggable="false"
                     data-original-width="{draggable_width}"
                     data-original-height="{draggable_height}"
                />
            </div>
            <p id="coordinate">Foreground position: (x=?, y=?)</p>
        </body>
        </html>
        """
        return html_code

    def on_upload(self, background_img, draggable_img):
        """Handle the images after upload."""
        if background_img is None or draggable_img is None:
            return "<p style='color:red;'>Please upload a background image and a draggable image first.</p>"

        if draggable_img.mode != "RGB":
            draggable_img = draggable_img.convert("RGB")
        draggable_img_mask = remove_bg(draggable_img)
        alpha_channel = draggable_img_mask.convert("L")
        draggable_img = draggable_img.convert("RGBA")
        draggable_img.putalpha(alpha_channel)

        resized_bg = self.resize_background_image(background_img, max_size=400)
        bg_w, bg_h = resized_bg.size

        resized_fg = self.resize_draggable_image(draggable_img, max_size=400)
        draggable_width, draggable_height = resized_fg.size

        background_img_b64 = self.pil_to_base64(resized_bg)
        draggable_img_b64 = self.pil_to_base64(resized_fg)

        return self.generate_html(
            background_img_b64, bg_w, bg_h,
            draggable_img_b64, draggable_width, draggable_height,
            canvas_size=400
        ), draggable_img

    def save_image(self, save_path="/mnt/bn/hjj-humanseg-lq/SubjectDriven/DreamFuse/debug"):
        global generated_images
        save_name = self.get_next_sequence(save_path)
        generated_images[0].save(os.path.join(save_path, f"{save_name}_0_ori.png"))
        generated_images[1].save(os.path.join(save_path, f"{save_name}_0.png"))
        generated_images[2].save(os.path.join(save_path, f"{save_name}_1.png"))
        generated_images[3].save(os.path.join(save_path, f"{save_name}_2.png"))
        generated_images[4].save(os.path.join(save_path, f"{save_name}_0_mask.png"))
        generated_images[5].save(os.path.join(save_path, f"{save_name}_0_mask_scale.png"))
        generated_images[6].save(os.path.join(save_path, f"{save_name}_0_scale.png"))
        generated_images[7].save(os.path.join(save_path, f"{save_name}_2_pasted.png"))

    def create_gui(self):
        """Create the Gradio interface."""
        config = InferenceConfig()
        config.lora_id = 'LL3RD/DreamFuse'

        pipeline = DreamFuseInference(config)
        # Allow up to 120 s of ZeroGPU time per generation call.
        pipeline.gradio_generate = spaces.GPU(duration=120)(pipeline.gradio_generate)
        with gr.Blocks(css=self.css_style) as demo:
            modified_fg_state = gr.State()
            gr.Markdown("# Dreamblend-GUI-dirtydata")
            gr.Markdown("Upload a background and a foreground image to generate a composite with a draggable/scalable preview; seed settings and prompt text input are also supported.")
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Upload Images")
                    background_img_in = gr.Image(label="Background image", type="pil", height=240, width=240)
                    draggable_img_in = gr.Image(label="Foreground image", type="pil", image_mode="RGBA", height=240, width=240)
                    generate_btn = gr.Button("Generate draggable canvas")

                    with gr.Row():
                        gr.Examples(
                            examples=[self.examples[0]],
                            inputs=[background_img_in, draggable_img_in],
                            elem_id="small-examples"
                        )
                with gr.Column(scale=1):
                    gr.Markdown("### Preview Area")
                    html_out = gr.HTML(label="Preview and drag", elem_id="canvas_preview")

            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Parameter Settings")
                    seed_slider = gr.Slider(minimum=0, maximum=10000, step=1, label="Seed", value=42)
                    cfg_slider = gr.Slider(minimum=1, maximum=10, step=0.1, label="CFG", value=3.5)
                    size_select = gr.Radio(
                        choices=["512", "768", "1024"],
                        value="512",
                        label="Generation quality (512 = low, 1024 = high)",
                    )
                    prompt_text = gr.Textbox(label="Prompt", placeholder="Enter a text prompt", value="")
                    text_strength = gr.Slider(minimum=1, maximum=10, step=1, label="Text Strength", value=1)
                    enable_gui = gr.Checkbox(label="Enable GUI", value=True)
                    enable_truecfg = gr.Checkbox(label="Enable TrueCFG", value=False)
                    enable_save = gr.Button("Save images (internal testing)", visible=True)
                with gr.Column(scale=1):
                    gr.Markdown("### Model Output")
                    model_generate_btn = gr.Button("Run model")
                    transformation_text = gr.Textbox(label="Transformation Info", elem_id="transformation_info", visible=False)
                    model_output = gr.Image(label="Model output", type="pil")

            # Bind interaction events
            enable_save.click(fn=self.save_image, inputs=None, outputs=None)
            generate_btn.click(
                fn=self.on_upload,
                inputs=[background_img_in, draggable_img_in],
                outputs=[html_out, modified_fg_state],
            )
            model_generate_btn.click(
                fn=pipeline.gradio_generate,
                inputs=[background_img_in, modified_fg_state, transformation_text, seed_slider,
                        prompt_text, enable_gui, cfg_slider, size_select, text_strength, enable_truecfg],
                outputs=model_output
            )
            # Initialize drag/scale events after the page loads
            demo.load(None, None, None, js=self.js_script)
            generate_btn.click(fn=None, inputs=None, outputs=None, js="initializeDrag")

        return demo

if __name__ == "__main__":

    gui = DreamblendGUI()
    demo = gui.create_gui()
    demo.queue()
    demo.launch()
    # demo.launch(server_port=7789, ssr_mode=False)
    # demo.launch(server_name="[::]", share=True)
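Note: the drag/scale state that the embedded JS writes into the hidden "transformation_info" textbox is a JSON object with drag_left, drag_top, drag_width, drag_height, data_original_width, data_original_height and scale_ratio. How DreamFuseInference.gradio_generate consumes it is not part of this commit; the sketch below is only a hypothetical helper (not in dreamfuse_inference.py) showing, under the assumption that the preview background is letterboxed and centered in the 400x400 canvas, how such a payload could be mapped back to the original background's pixel space.

import json

def preview_to_background_coords(transformation_json: str,
                                 bg_orig_size: tuple,
                                 bg_preview_size: tuple,
                                 canvas_size: int = 400) -> dict:
    """Map the preview drag/scale info back to original-background pixels
    (hypothetical helper, for illustration only)."""
    t = json.loads(transformation_json)
    bg_w, bg_h = bg_orig_size        # size of the uploaded background
    pv_w, pv_h = bg_preview_size     # size after resize_background_image (longest side <= 400)

    # The preview background is centered in the canvas, so remove the
    # letterbox offset before rescaling preview pixels to original pixels.
    offset_x = (canvas_size - pv_w) / 2
    offset_y = (canvas_size - pv_h) / 2
    scale = bg_w / pv_w              # aspect ratio is preserved, so one factor suffices

    return {
        "x": (t["drag_left"] - offset_x) * scale,
        "y": (t["drag_top"] - offset_y) * scale,
        "width": t["drag_width"] * scale,
        "height": t["drag_height"] * scale,
    }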
dreamfuse/.DS_Store ADDED
Binary file (6.15 kB).
dreamfuse/models/dreamfuse_flux/__pycache__/flux_processor.cpython-310.pyc ADDED
Binary file (7.61 kB).
dreamfuse/models/dreamfuse_flux/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (23.9 kB).
dreamfuse/models/dreamfuse_flux/flux_processor.py ADDED
@@ -0,0 +1,269 @@
import inspect
import math
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
from torch import nn

from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import deprecate, logging
from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
from diffusers.utils.torch_utils import is_torch_version, maybe_allow_in_graph
from diffusers.models.attention import Attention
from diffusers.models.embeddings import Timesteps, TimestepEmbedding, PixArtAlphaTextProjection

class CombinedTimestepGuidanceTextProjEmbeddings(nn.Module):
    def __init__(self, embedding_dim, pooled_projection_dim):
        super().__init__()

        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.guidance_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
        self.text_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu")

    def forward(self, timestep, guidance, pooled_projection):
        timesteps_proj = self.time_proj(timestep)
        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=pooled_projection.dtype))  # (N, D)

        if (guidance >= 0).all():
            guidance_proj = self.time_proj(guidance)
            guidance_emb = self.guidance_embedder(guidance_proj.to(dtype=pooled_projection.dtype))  # (N, D)

            time_guidance_emb = timesteps_emb + guidance_emb

            pooled_projections = self.text_embedder(pooled_projection)
            conditioning = time_guidance_emb + pooled_projections
        else:
            pooled_projections = self.text_embedder(pooled_projection)
            conditioning = timesteps_emb + pooled_projections

        return conditioning


def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
    tensors contain rotary embeddings and are returned as real tensors.

    Args:
        x (`torch.Tensor`):
            Query or key tensor to apply rotary embeddings. [B, H, S, D] xk (torch.Tensor): Key tensor to apply
        freqs_cis (`Tuple[torch.Tensor]`): Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)

    Returns:
        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
    """
    if use_real:
        cos, sin = freqs_cis  # [S, D]
        if cos.ndim == 2:
            cos = cos[None, None]
        else:
            cos = cos.unsqueeze(1)
        if sin.ndim == 2:
            sin = sin[None, None]
        else:
            sin = sin.unsqueeze(1)
        cos, sin = cos.to(x.device), sin.to(x.device)

        if use_real_unbind_dim == -1:
            # Used for flux, cogvideox, hunyuan-dit
            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        elif use_real_unbind_dim == -2:
            # Used for Stable Audio
            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
        else:
            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out
    else:
        # used for lumina
        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        freqs_cis = freqs_cis.unsqueeze(2)
        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)

        return x_out.type_as(x)

class FluxAttnSharedProcessor2_0:
    """Attention processor used typically in processing the SD3-like self-attention projections."""

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
        data_num_per_group: Optional[int] = 1,
        max_sequence_length: Optional[int] = 512,
        mix_attention: bool = True,
        cond_latents = None,
        cond_image_rotary_emb = None,
        work_mode = None,
        mask_cond = None,
    ) -> torch.FloatTensor:
        with_cond = cond_latents is not None and mix_attention

        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape

        # `sample` projections.
        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
        if encoder_hidden_states is not None:
            # `context` projections.
            encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
            encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
            encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)

            encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)
            encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)
            encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
                batch_size, -1, attn.heads, head_dim
            ).transpose(1, 2)

            if attn.norm_added_q is not None:
                encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
            if attn.norm_added_k is not None:
                encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)

            # attention
            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

        if image_rotary_emb is not None:
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)

        if with_cond:
            cond_bs = cond_latents.shape[0]

            # update condition
            cond_query = attn.to_q(cond_latents)
            cond_query = cond_query.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            if attn.norm_q is not None:
                cond_query = attn.norm_q(cond_query)
            cond_query = apply_rotary_emb(cond_query, cond_image_rotary_emb)
            cond_query = torch.cat(cond_query.chunk(len(cond_query), dim=0), dim=2)

            cond_key = attn.to_k(cond_latents)
            cond_value = attn.to_v(cond_latents)
            cond_key = cond_key.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            cond_value = cond_value.view(cond_bs, -1, attn.heads, head_dim).transpose(1, 2)
            if attn.norm_k is not None:
                cond_key = attn.norm_k(cond_key)

            cond_key = apply_rotary_emb(cond_key, cond_image_rotary_emb)

            cond_key = torch.cat(cond_key.chunk(len(cond_key), dim=0), dim=2)
            cond_value = torch.cat(cond_value.chunk(len(cond_value), dim=0), dim=2)

        if data_num_per_group > 1 and mix_attention:
            E = max_sequence_length  # according to text len

            key_enc, key_hid = key[:, :, :E], key[:, :, E:]
            value_enc, value_hid = value[:, :, :E], value[:, :, E:]

            key_layer = key_hid.chunk(data_num_per_group, dim=0)
            key_layer = torch.cat(key_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)

            value_layer = value_hid.chunk(data_num_per_group, dim=0)
            value_layer = torch.cat(value_layer, dim=2).repeat(data_num_per_group, 1, 1, 1)

            key = torch.cat([key_enc, key_layer], dim=2)
            value = torch.cat([value_enc, value_layer], dim=2)

        elif data_num_per_group == 1 and mix_attention and with_cond:
            E = max_sequence_length  # according to text len

            key_enc, key_hid = key[:, :, :E], key[:, :, E:]
            value_enc, value_hid = value[:, :, :E], value[:, :, E:]

            # todo: support bs != 1
            key_layer = torch.cat([key_hid, cond_key], dim=2)
            value_layer = torch.cat([value_hid, cond_value], dim=2)

            key = torch.cat([key_enc, key_layer], dim=2)
            value = torch.cat([value_enc, value_layer], dim=2)

            # concat query
            query_enc, query_hid = query[:, :, :E], query[:, :, E:]
            query_layer = torch.cat([query_hid, cond_query], dim=2)
            query = torch.cat([query_enc, query_layer], dim=2)

        hidden_states = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,
        )
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        if encoder_hidden_states is not None:
            if with_cond:
                encoder_hidden_states, hidden_states, cond_latents = (
                    hidden_states[:, : encoder_hidden_states.shape[1]],
                    hidden_states[:, encoder_hidden_states.shape[1] : -cond_latents.shape[1]*cond_bs],
                    hidden_states[:, -cond_latents.shape[1]*cond_bs :],
                )
                cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
                cond_latents = attn.to_out[0](cond_latents)
                cond_latents = attn.to_out[1](cond_latents)
            else:
                encoder_hidden_states, hidden_states = (
                    hidden_states[:, : encoder_hidden_states.shape[1]],
                    hidden_states[:, encoder_hidden_states.shape[1]:],
                )

            # linear proj
            hidden_states = attn.to_out[0](hidden_states)
            # dropout
            hidden_states = attn.to_out[1](hidden_states)

            encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

            if with_cond:
                return hidden_states, encoder_hidden_states, cond_latents
            return hidden_states, encoder_hidden_states
        else:
            if with_cond:
                hidden_states, cond_latents = (
                    hidden_states[:, : -cond_latents.shape[1]*cond_bs],
                    hidden_states[:, -cond_latents.shape[1]*cond_bs :],
                )
                cond_latents = cond_latents.view(cond_bs, cond_latents.shape[1] // cond_bs, cond_latents.shape[2])
                return hidden_states, cond_latents
            return hidden_states
dreamfuse/models/dreamfuse_flux/transformer.py
ADDED
|
@@ -0,0 +1,866 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright 2024 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 4 |
+
# you may not use this file except in compliance with the License.
|
| 5 |
+
# You may obtain a copy of the License at
|
| 6 |
+
#
|
| 7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 8 |
+
#
|
| 9 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 12 |
+
# See the License for the specific language governing permissions and
|
| 13 |
+
# limitations under the License.
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
from typing import Any, Dict, Optional, Tuple, Union
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
import torch
|
| 20 |
+
import torch.nn as nn
|
| 21 |
+
import torch.nn.functional as F
|
| 22 |
+
|
| 23 |
+
from diffusers.configuration_utils import ConfigMixin, register_to_config
|
| 24 |
+
from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
|
| 25 |
+
from diffusers.models.attention import FeedForward
|
| 26 |
+
from diffusers.models.attention_processor import (
|
| 27 |
+
Attention,
|
| 28 |
+
AttentionProcessor,
|
| 29 |
+
FluxAttnProcessor2_0,
|
| 30 |
+
FluxAttnProcessor2_0_NPU,
|
| 31 |
+
FusedFluxAttnProcessor2_0,
|
| 32 |
+
)
|
| 33 |
+
from dreamfuse.models.dreamfuse_flux.flux_processor import FluxAttnSharedProcessor2_0
|
| 34 |
+
from diffusers.models.modeling_utils import ModelMixin
|
| 35 |
+
from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
|
| 36 |
+
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
| 37 |
+
from diffusers.utils.import_utils import is_torch_npu_available
|
| 38 |
+
from diffusers.utils.torch_utils import maybe_allow_in_graph
|
| 39 |
+
from diffusers.models.embeddings import CombinedTimestepTextProjEmbeddings, FluxPosEmbed
|
| 40 |
+
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
| 41 |
+
|
| 42 |
+
from .flux_processor import CombinedTimestepGuidanceTextProjEmbeddings
|
| 43 |
+
|
| 44 |
+
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
| 45 |
+
|
| 46 |
+
def zero_module(module):
|
| 47 |
+
for p in module.parameters():
|
| 48 |
+
nn.init.zeros_(p)
|
| 49 |
+
return module
|
| 50 |
+
|
| 51 |
+
class LayerNorm2d(nn.Module):
|
| 52 |
+
def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
|
| 53 |
+
super().__init__()
|
| 54 |
+
self.weight = nn.Parameter(torch.ones(num_channels))
|
| 55 |
+
self.bias = nn.Parameter(torch.zeros(num_channels))
|
| 56 |
+
self.eps = eps
|
| 57 |
+
|
| 58 |
+
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
| 59 |
+
u = x.mean(1, keepdim=True)
|
| 60 |
+
s = (x - u).pow(2).mean(1, keepdim=True)
|
| 61 |
+
x = (x - u) / torch.sqrt(s + self.eps)
|
| 62 |
+
x = self.weight[:, None, None] * x + self.bias[:, None, None]
|
| 63 |
+
return x
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class CrossAttention(nn.Module):
|
| 67 |
+
def __init__(self, query_dim: int, cross_attention_dim: int, heads: int = 8, dim_head: int = 64, dropout: float = 0.0, bias: bool = False):
|
| 68 |
+
super().__init__()
|
| 69 |
+
self.heads = heads
|
| 70 |
+
self.dim_head = cross_attention_dim // heads
|
| 71 |
+
self.attn_to_q = nn.Linear(query_dim, cross_attention_dim, bias=bias)
|
| 72 |
+
self.norm_q = nn.LayerNorm(self.dim_head)
|
| 73 |
+
|
| 74 |
+
self.attn_to_k = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)
|
| 75 |
+
self.norm_k = nn.LayerNorm(self.dim_head)
|
| 76 |
+
|
| 77 |
+
self.attn_to_v = nn.Linear(cross_attention_dim, cross_attention_dim, bias=bias)
|
| 78 |
+
|
| 79 |
+
self.attn_to_out = nn.ModuleList([])
|
| 80 |
+
self.attn_to_out.append(nn.Linear(query_dim, query_dim, bias=bias))
|
| 81 |
+
self.attn_to_out.append(nn.Dropout(dropout))
|
| 82 |
+
|
| 83 |
+
# zero init
|
| 84 |
+
with torch.no_grad():
|
| 85 |
+
self.attn_to_out[0].weight.fill_(0)
|
| 86 |
+
# self.to_out[0].bias.fill_(0)
|
| 87 |
+
|
| 88 |
+
def forward(self, hidden_states, encoder_hidden_states, attention_mask=None):
|
| 89 |
+
batch_size, sequence_length, _ = hidden_states.shape
|
| 90 |
+
|
| 91 |
+
query = self.attn_to_q(hidden_states)
|
| 92 |
+
key = self.attn_to_k(encoder_hidden_states)
|
| 93 |
+
value = self.attn_to_v(encoder_hidden_states)
|
| 94 |
+
|
| 95 |
+
inner_dim = key.shape[-1]
|
| 96 |
+
head_dim = inner_dim // self.heads
|
| 97 |
+
|
| 98 |
+
query = query.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
|
| 99 |
+
key = key.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
|
| 100 |
+
value = value.view(batch_size, -1, self.heads, head_dim).transpose(1, 2)
|
| 101 |
+
|
| 102 |
+
query = self.norm_q(query)
|
| 103 |
+
key = self.norm_k(key)
|
| 104 |
+
|
| 105 |
+
hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,)
|
| 106 |
+
hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.heads * head_dim)
|
| 107 |
+
|
| 108 |
+
hidden_states = self.attn_to_out[0](hidden_states)
|
| 109 |
+
hidden_states = self.attn_to_out[1](hidden_states)
|
| 110 |
+
|
| 111 |
+
return hidden_states
|
| 112 |
+
|
| 113 |
+
@maybe_allow_in_graph
|
| 114 |
+
class FluxSingleTransformerBlock(nn.Module):
|
| 115 |
+
r"""
|
| 116 |
+
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
| 117 |
+
|
| 118 |
+
Reference: https://arxiv.org/abs/2403.03206
|
| 119 |
+
|
| 120 |
+
Parameters:
|
| 121 |
+
dim (`int`): The number of channels in the input and output.
|
| 122 |
+
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
| 123 |
+
attention_head_dim (`int`): The number of channels in each head.
|
| 124 |
+
context_pre_only (`bool`): Boolean to determine if we should add some blocks associated with the
|
| 125 |
+
processing of `context` conditions.
|
| 126 |
+
"""
|
| 127 |
+
|
| 128 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
|
| 129 |
+
super().__init__()
|
| 130 |
+
self.mlp_hidden_dim = int(dim * mlp_ratio)
|
| 131 |
+
|
| 132 |
+
self.norm = AdaLayerNormZeroSingle(dim)
|
| 133 |
+
self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
|
| 134 |
+
self.act_mlp = nn.GELU(approximate="tanh")
|
| 135 |
+
self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
|
| 136 |
+
|
| 137 |
+
processor = FluxAttnSharedProcessor2_0()
|
| 138 |
+
|
| 139 |
+
self.attn = Attention(
|
| 140 |
+
query_dim=dim,
|
| 141 |
+
cross_attention_dim=None,
|
| 142 |
+
dim_head=attention_head_dim,
|
| 143 |
+
heads=num_attention_heads,
|
| 144 |
+
out_dim=dim,
|
| 145 |
+
bias=True,
|
| 146 |
+
processor=processor,
|
| 147 |
+
qk_norm="rms_norm",
|
| 148 |
+
eps=1e-6,
|
| 149 |
+
pre_only=True,
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
def forward(
|
| 153 |
+
self,
|
| 154 |
+
hidden_states: torch.FloatTensor,
|
| 155 |
+
temb: torch.FloatTensor,
|
| 156 |
+
image_rotary_emb=None,
|
| 157 |
+
data_num_per_group=1,
|
| 158 |
+
max_sequence_length=512,
|
| 159 |
+
mix_attention: bool = True,
|
| 160 |
+
cond_temb = None,
|
| 161 |
+
cond_image_rotary_emb = None,
|
| 162 |
+
cond_latents = None,
|
| 163 |
+
joint_attention_kwargs=None,
|
| 164 |
+
|
| 165 |
+
):
|
| 166 |
+
with_cond = cond_latents is not None and mix_attention
|
| 167 |
+
|
| 168 |
+
residual = hidden_states
|
| 169 |
+
norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
|
| 170 |
+
mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
|
| 171 |
+
|
| 172 |
+
if with_cond:
|
| 173 |
+
residual_cond = cond_latents
|
| 174 |
+
norm_cond_latents, cond_gate = self.norm(cond_latents, emb=cond_temb)
|
| 175 |
+
mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_latents))
|
| 176 |
+
|
| 177 |
+
joint_attention_kwargs = joint_attention_kwargs or {}
|
| 178 |
+
attn_output = self.attn(
|
| 179 |
+
hidden_states=norm_hidden_states,
|
| 180 |
+
image_rotary_emb=image_rotary_emb,
|
| 181 |
+
data_num_per_group=data_num_per_group,
|
| 182 |
+
max_sequence_length=max_sequence_length,
|
| 183 |
+
mix_attention=mix_attention,
|
| 184 |
+
cond_latents=norm_cond_latents if with_cond else None,
|
| 185 |
+
cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
|
| 186 |
+
**joint_attention_kwargs,
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
if with_cond:
|
| 190 |
+
attn_output, cond_attn_output = attn_output
|
| 191 |
+
|
| 192 |
+
hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
|
| 193 |
+
gate = gate.unsqueeze(1)
|
| 194 |
+
hidden_states = gate * self.proj_out(hidden_states)
|
| 195 |
+
hidden_states = residual + hidden_states
|
| 196 |
+
|
| 197 |
+
if with_cond:
|
| 198 |
+
cond_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
|
| 199 |
+
cond_gate = cond_gate.unsqueeze(1)
|
| 200 |
+
cond_latents = cond_gate * self.proj_out(cond_latents)
|
| 201 |
+
cond_latents = residual_cond + cond_latents
|
| 202 |
+
|
| 203 |
+
if hidden_states.dtype == torch.float16:
|
| 204 |
+
hidden_states = hidden_states.clip(-65504, 65504)
|
| 205 |
+
|
| 206 |
+
if with_cond:
|
| 207 |
+
return hidden_states, cond_latents
|
| 208 |
+
else:
|
| 209 |
+
return hidden_states
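
A minimal usage sketch of this single-stream block (not part of the commit; sizes are illustrative, the released checkpoints use dim=3072 with 24 heads of 128, and it assumes the shared attention processor simply skips RoPE when image_rotary_emb is None):

import torch

block = FluxSingleTransformerBlock(dim=128, num_attention_heads=4, attention_head_dim=32)
hidden_states = torch.randn(2, 64, 128)   # packed latent tokens (batch, seq, dim)
temb = torch.randn(2, 128)                # pooled timestep/text embedding
out = block(hidden_states, temb)          # a (hidden, cond) pair is returned only when cond_latents is given
print(out.shape)                          # torch.Size([2, 64, 128])
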
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
@maybe_allow_in_graph
|
| 213 |
+
class FluxTransformerBlock(nn.Module):
|
| 214 |
+
r"""
|
| 215 |
+
A Transformer block following the MMDiT architecture, introduced in Stable Diffusion 3.
|
| 216 |
+
|
| 217 |
+
Reference: https://arxiv.org/abs/2403.03206
|
| 218 |
+
|
| 219 |
+
Parameters:
|
| 220 |
+
dim (`int`): The number of channels in the input and output.
|
| 221 |
+
num_attention_heads (`int`): The number of heads to use for multi-head attention.
|
| 222 |
+
attention_head_dim (`int`): The number of channels in each head.
|
| 223 |
+
qk_norm (`str`, defaults to `"rms_norm"`): Normalization applied to the query and key projections.
|
| 224 |
+
eps (`float`, defaults to 1e-6): Epsilon used by the query/key normalization.
|
| 225 |
+
"""
|
| 226 |
+
|
| 227 |
+
def __init__(self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6):
|
| 228 |
+
super().__init__()
|
| 229 |
+
|
| 230 |
+
self.norm1 = AdaLayerNormZero(dim)
|
| 231 |
+
|
| 232 |
+
self.norm1_context = AdaLayerNormZero(dim)
|
| 233 |
+
|
| 234 |
+
processor = FluxAttnSharedProcessor2_0()
|
| 235 |
+
|
| 236 |
+
self.attn = Attention(
|
| 237 |
+
query_dim=dim,
|
| 238 |
+
cross_attention_dim=None,
|
| 239 |
+
added_kv_proj_dim=dim,
|
| 240 |
+
dim_head=attention_head_dim,
|
| 241 |
+
heads=num_attention_heads,
|
| 242 |
+
out_dim=dim,
|
| 243 |
+
context_pre_only=False,
|
| 244 |
+
bias=True,
|
| 245 |
+
processor=processor,
|
| 246 |
+
qk_norm=qk_norm,
|
| 247 |
+
eps=eps,
|
| 248 |
+
)
|
| 249 |
+
|
| 250 |
+
self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
| 251 |
+
self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
| 252 |
+
|
| 253 |
+
self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
|
| 254 |
+
self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
|
| 255 |
+
|
| 256 |
+
# let chunk size default to None
|
| 257 |
+
self._chunk_size = None
|
| 258 |
+
self._chunk_dim = 0
|
| 259 |
+
|
| 260 |
+
def forward(
|
| 261 |
+
self,
|
| 262 |
+
hidden_states: torch.FloatTensor,
|
| 263 |
+
encoder_hidden_states: torch.FloatTensor,
|
| 264 |
+
temb: torch.FloatTensor,
|
| 265 |
+
image_rotary_emb=None,
|
| 266 |
+
data_num_per_group=1,
|
| 267 |
+
max_sequence_length=512,
|
| 268 |
+
mix_attention: bool = True,
|
| 269 |
+
cond_temb = None,
|
| 270 |
+
cond_image_rotary_emb = None,
|
| 271 |
+
cond_latents = None,
|
| 272 |
+
joint_attention_kwargs=None,
|
| 273 |
+
):
|
| 274 |
+
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
|
| 275 |
+
|
| 276 |
+
norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
|
| 277 |
+
encoder_hidden_states, emb=temb
|
| 278 |
+
)
|
| 279 |
+
joint_attention_kwargs = joint_attention_kwargs or {}
|
| 280 |
+
|
| 281 |
+
with_cond = cond_latents is not None and mix_attention
|
| 282 |
+
if with_cond:
|
| 283 |
+
norm_cond_latents, cond_gate_msa, cond_shift_mlp, cond_scale_mlp, cond_gate_mlp = self.norm1(cond_latents, emb=cond_temb)
|
| 284 |
+
|
| 285 |
+
# Attention.
|
| 286 |
+
attention_outputs = self.attn(
|
| 287 |
+
hidden_states=norm_hidden_states,
|
| 288 |
+
encoder_hidden_states=norm_encoder_hidden_states,
|
| 289 |
+
image_rotary_emb=image_rotary_emb,
|
| 290 |
+
data_num_per_group=data_num_per_group,
|
| 291 |
+
max_sequence_length=max_sequence_length,
|
| 292 |
+
mix_attention=mix_attention,
|
| 293 |
+
cond_latents=norm_cond_latents if with_cond else None,
|
| 294 |
+
cond_image_rotary_emb=cond_image_rotary_emb if with_cond else None,
|
| 295 |
+
**joint_attention_kwargs,
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
if len(attention_outputs) == 2:
|
| 299 |
+
attn_output, context_attn_output = attention_outputs
|
| 300 |
+
elif len(attention_outputs) == 3 and with_cond:
|
| 301 |
+
attn_output, context_attn_output, cond_attn_output = attention_outputs
|
| 302 |
+
elif len(attention_outputs) == 3:
|
| 303 |
+
attn_output, context_attn_output, ip_attn_output = attention_outputs
|
| 304 |
+
|
| 305 |
+
# Process attention outputs for the `hidden_states`.
|
| 306 |
+
attn_output = gate_msa.unsqueeze(1) * attn_output
|
| 307 |
+
hidden_states = hidden_states + attn_output
|
| 308 |
+
|
| 309 |
+
norm_hidden_states = self.norm2(hidden_states)
|
| 310 |
+
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
|
| 311 |
+
|
| 312 |
+
ff_output = self.ff(norm_hidden_states)
|
| 313 |
+
ff_output = gate_mlp.unsqueeze(1) * ff_output
|
| 314 |
+
|
| 315 |
+
hidden_states = hidden_states + ff_output
|
| 316 |
+
if len(attention_outputs) == 3 and not with_cond:
|
| 317 |
+
hidden_states = hidden_states + ip_attn_output
|
| 318 |
+
|
| 319 |
+
if with_cond:
|
| 320 |
+
cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
|
| 321 |
+
cond_latents = cond_latents + cond_attn_output
|
| 322 |
+
|
| 323 |
+
norm_cond_latents = self.norm2(cond_latents)
|
| 324 |
+
norm_cond_latents = norm_cond_latents * (1 + cond_scale_mlp[:, None]) + cond_shift_mlp[:, None]
|
| 325 |
+
|
| 326 |
+
cond_ff_output = self.ff(norm_cond_latents)
|
| 327 |
+
cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
|
| 328 |
+
|
| 329 |
+
cond_latents = cond_latents + cond_ff_output
|
| 330 |
+
# Process attention outputs for the `encoder_hidden_states`.
|
| 331 |
+
|
| 332 |
+
context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
|
| 333 |
+
encoder_hidden_states = encoder_hidden_states + context_attn_output
|
| 334 |
+
|
| 335 |
+
norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
|
| 336 |
+
norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
|
| 337 |
+
|
| 338 |
+
context_ff_output = self.ff_context(norm_encoder_hidden_states)
|
| 339 |
+
encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
|
| 340 |
+
if encoder_hidden_states.dtype == torch.float16:
|
| 341 |
+
encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
|
| 342 |
+
|
| 343 |
+
if with_cond:
|
| 344 |
+
return encoder_hidden_states, hidden_states, cond_latents
|
| 345 |
+
else:
|
| 346 |
+
return encoder_hidden_states, hidden_states
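
A comparable hedged sketch for this dual-stream (MMDiT) block; shapes are illustrative and mirror how FluxTransformer2DModel.forward calls it, again assuming RoPE is skipped when image_rotary_emb is None:

import torch

block = FluxTransformerBlock(dim=128, num_attention_heads=4, attention_head_dim=32)
hidden_states = torch.randn(2, 64, 128)          # image tokens
encoder_hidden_states = torch.randn(2, 16, 128)  # projected text tokens
temb = torch.randn(2, 128)
enc_out, hid_out = block(hidden_states, encoder_hidden_states, temb)
# with cond_latents / cond_temb supplied (and mix_attention=True) a third tensor,
# the updated condition stream, is returned as well
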
|
| 347 |
+
|
| 348 |
+
|
| 349 |
+
class FluxTransformer2DModel(
|
| 350 |
+
ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
|
| 351 |
+
):
|
| 352 |
+
"""
|
| 353 |
+
The Transformer model introduced in Flux.
|
| 354 |
+
|
| 355 |
+
Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
|
| 356 |
+
|
| 357 |
+
Parameters:
|
| 358 |
+
patch_size (`int`): Patch size to turn the input data into small patches.
|
| 359 |
+
in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
|
| 360 |
+
num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
|
| 361 |
+
num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
|
| 362 |
+
attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
|
| 363 |
+
num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
|
| 364 |
+
joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
|
| 365 |
+
pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
|
| 366 |
+
guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
|
| 367 |
+
"""
|
| 368 |
+
|
| 369 |
+
_supports_gradient_checkpointing = True
|
| 370 |
+
_no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
|
| 371 |
+
|
| 372 |
+
@register_to_config
|
| 373 |
+
def __init__(
|
| 374 |
+
self,
|
| 375 |
+
patch_size: int = 1,
|
| 376 |
+
in_channels: int = 64,
|
| 377 |
+
out_channels: Optional[int] = None,
|
| 378 |
+
num_layers: int = 19,
|
| 379 |
+
num_single_layers: int = 38,
|
| 380 |
+
attention_head_dim: int = 128,
|
| 381 |
+
num_attention_heads: int = 24,
|
| 382 |
+
joint_attention_dim: int = 4096,
|
| 383 |
+
pooled_projection_dim: int = 768,
|
| 384 |
+
guidance_embeds: bool = False,
|
| 385 |
+
axes_dims_rope: Tuple[int] = (16, 56, 56),
|
| 386 |
+
):
|
| 387 |
+
super().__init__()
|
| 388 |
+
self.out_channels = out_channels or in_channels
|
| 389 |
+
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
|
| 390 |
+
if getattr(self.config, "num_image_tag_embeddings", None) is not None:
|
| 391 |
+
self.image_tag_embeddings = nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim)
|
| 392 |
+
if getattr(self.config, "num_context_tag_embeddings", None) is not None:
|
| 393 |
+
self.context_tag_embeddings = nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim)
|
| 394 |
+
|
| 395 |
+
self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
|
| 396 |
+
|
| 397 |
+
text_time_guidance_cls = (
|
| 398 |
+
CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
|
| 399 |
+
)
|
| 400 |
+
self.time_text_embed = text_time_guidance_cls(
|
| 401 |
+
embedding_dim=self.inner_dim, pooled_projection_dim=self.config.pooled_projection_dim
|
| 402 |
+
)
|
| 403 |
+
|
| 404 |
+
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
| 405 |
+
self.x_embedder = nn.Linear(self.config.in_channels, self.inner_dim)
|
| 406 |
+
|
| 407 |
+
self.transformer_blocks = nn.ModuleList(
|
| 408 |
+
[
|
| 409 |
+
FluxTransformerBlock(
|
| 410 |
+
dim=self.inner_dim,
|
| 411 |
+
num_attention_heads=self.config.num_attention_heads,
|
| 412 |
+
attention_head_dim=self.config.attention_head_dim,
|
| 413 |
+
)
|
| 414 |
+
for i in range(self.config.num_layers)
|
| 415 |
+
]
|
| 416 |
+
)
|
| 417 |
+
|
| 418 |
+
self.single_transformer_blocks = nn.ModuleList(
|
| 419 |
+
[
|
| 420 |
+
FluxSingleTransformerBlock(
|
| 421 |
+
dim=self.inner_dim,
|
| 422 |
+
num_attention_heads=self.config.num_attention_heads,
|
| 423 |
+
attention_head_dim=self.config.attention_head_dim,
|
| 424 |
+
)
|
| 425 |
+
for i in range(self.config.num_single_layers)
|
| 426 |
+
]
|
| 427 |
+
)
|
| 428 |
+
|
| 429 |
+
self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
|
| 430 |
+
self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
|
| 431 |
+
|
| 432 |
+
self.gradient_checkpointing = False
|
| 433 |
+
|
| 434 |
+
def set_tag_embeddings(self, num_image_tag_embeddings=0, num_context_tag_embeddings=0):
|
| 435 |
+
if num_image_tag_embeddings > 0:
|
| 436 |
+
self.config.num_image_tag_embeddings = num_image_tag_embeddings
|
| 437 |
+
self.image_tag_embeddings = zero_module(nn.Embedding(self.config.num_image_tag_embeddings, self.inner_dim))
|
| 438 |
+
if num_context_tag_embeddings > 0:
|
| 439 |
+
self.config.num_context_tag_embeddings = num_context_tag_embeddings
|
| 440 |
+
self.context_tag_embeddings = zero_module(nn.Embedding(self.config.num_context_tag_embeddings, self.inner_dim))
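
Because both embedding tables are wrapped in zero_module, newly added tags start as a no-op and only shift the streams once trained. A hedged sketch, given an already constructed FluxTransformer2DModel `model` (tag counts are illustrative):

model.set_tag_embeddings(num_image_tag_embeddings=3, num_context_tag_embeddings=3)
# forward(...) can then be given image_tags=[0, 1, 2] (and optionally context_tags) so that
# the generated stream and each condition stream receive their own learned offset.
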
|
| 441 |
+
|
| 442 |
+
def set_mask_tokenizer(self, mask_in_chans, mask_out_chans, activation = nn.GELU):
|
| 443 |
+
self.mask_tokenizer = nn.Sequential(
|
| 444 |
+
nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),
|
| 445 |
+
LayerNorm2d(mask_in_chans // 4),
|
| 446 |
+
activation(),
|
| 447 |
+
nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=3, padding=1),
|
| 448 |
+
LayerNorm2d(mask_in_chans),
|
| 449 |
+
activation(),
|
| 450 |
+
nn.Conv2d(mask_in_chans, mask_out_chans, kernel_size=1),
|
| 451 |
+
nn.AdaptiveAvgPool2d((16, 16))
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
self.mask_attn = CrossAttention(mask_out_chans, mask_out_chans)
|
| 455 |
+
|
| 456 |
+
def forward_mask_attn(self, mask_images, fg_images):
|
| 457 |
+
mask_images = self.mask_tokenizer(mask_images)
|
| 458 |
+
mask_images = mask_images.flatten(2).transpose(1, 2)
|
| 459 |
+
mask_images = self.mask_attn(mask_images, fg_images, attention_mask=None)
|
| 460 |
+
return mask_images
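
A self-contained shape sketch of the mask-conditioning path (all sizes are illustrative and deliberately tiny; the printed shape is the expected one, with mask_out_chans chosen to match the model's inner_dim as forward() requires):

import torch

model = FluxTransformer2DModel(in_channels=16, num_layers=1, num_single_layers=1,
                               attention_head_dim=32, num_attention_heads=4,
                               joint_attention_dim=64, pooled_projection_dim=32,
                               axes_dims_rope=(8, 12, 12))
model.set_mask_tokenizer(mask_in_chans=16, mask_out_chans=128)
mask = torch.randn(1, 1, 64, 64)       # (B, 1, H, W) placement mask
fg_tokens = torch.randn(1, 256, 128)   # packed foreground condition tokens (dim = inner_dim)
mask_cond = model.forward_mask_attn(mask, fg_tokens)
print(mask_cond.shape)                 # expected torch.Size([1, 256, 128]): a 16x16 grid of mask tokens
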
|
| 461 |
+
|
| 462 |
+
@property
|
| 463 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
|
| 464 |
+
def attn_processors(self) -> Dict[str, AttentionProcessor]:
|
| 465 |
+
r"""
|
| 466 |
+
Returns:
|
| 467 |
+
`dict` of attention processors: A dictionary containing all attention processors used in the model,
|
| 468 |
+
indexed by their weight names.
|
| 469 |
+
"""
|
| 470 |
+
# set recursively
|
| 471 |
+
processors = {}
|
| 472 |
+
|
| 473 |
+
def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
|
| 474 |
+
if hasattr(module, "get_processor"):
|
| 475 |
+
processors[f"{name}.processor"] = module.get_processor()
|
| 476 |
+
|
| 477 |
+
for sub_name, child in module.named_children():
|
| 478 |
+
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
|
| 479 |
+
|
| 480 |
+
return processors
|
| 481 |
+
|
| 482 |
+
for name, module in self.named_children():
|
| 483 |
+
fn_recursive_add_processors(name, module, processors)
|
| 484 |
+
|
| 485 |
+
return processors
|
| 486 |
+
|
| 487 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
|
| 488 |
+
def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
|
| 489 |
+
r"""
|
| 490 |
+
Sets the attention processor to use to compute attention.
|
| 491 |
+
|
| 492 |
+
Parameters:
|
| 493 |
+
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
|
| 494 |
+
The instantiated processor class or a dictionary of processor classes that will be set as the processor
|
| 495 |
+
for **all** `Attention` layers.
|
| 496 |
+
|
| 497 |
+
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
|
| 498 |
+
processor. This is strongly recommended when setting trainable attention processors.
|
| 499 |
+
|
| 500 |
+
"""
|
| 501 |
+
count = len(self.attn_processors.keys())
|
| 502 |
+
|
| 503 |
+
if isinstance(processor, dict) and len(processor) != count:
|
| 504 |
+
raise ValueError(
|
| 505 |
+
f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
|
| 506 |
+
f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
|
| 507 |
+
)
|
| 508 |
+
|
| 509 |
+
def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
|
| 510 |
+
if hasattr(module, "set_processor"):
|
| 511 |
+
if not isinstance(processor, dict):
|
| 512 |
+
module.set_processor(processor)
|
| 513 |
+
else:
|
| 514 |
+
module.set_processor(processor.pop(f"{name}.processor"))
|
| 515 |
+
|
| 516 |
+
for sub_name, child in module.named_children():
|
| 517 |
+
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
|
| 518 |
+
|
| 519 |
+
for name, module in self.named_children():
|
| 520 |
+
fn_recursive_attn_processor(name, module, processor)
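
These two helpers follow the standard diffusers pattern; a brief hedged sketch of re-setting every attention module to the shared processor used here, given a constructed model:

procs = model.attn_processors   # e.g. {"transformer_blocks.0.attn.processor": ..., ...}
model.set_attn_processor({name: FluxAttnSharedProcessor2_0() for name in procs})
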
|
| 521 |
+
|
| 522 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
|
| 523 |
+
def fuse_qkv_projections(self):
|
| 524 |
+
"""
|
| 525 |
+
Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
|
| 526 |
+
are fused. For cross-attention modules, key and value projection matrices are fused.
|
| 527 |
+
|
| 528 |
+
<Tip warning={true}>
|
| 529 |
+
|
| 530 |
+
This API is 🧪 experimental.
|
| 531 |
+
|
| 532 |
+
</Tip>
|
| 533 |
+
"""
|
| 534 |
+
self.original_attn_processors = None
|
| 535 |
+
|
| 536 |
+
for _, attn_processor in self.attn_processors.items():
|
| 537 |
+
if "Added" in str(attn_processor.__class__.__name__):
|
| 538 |
+
raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
|
| 539 |
+
|
| 540 |
+
self.original_attn_processors = self.attn_processors
|
| 541 |
+
|
| 542 |
+
for module in self.modules():
|
| 543 |
+
if isinstance(module, Attention):
|
| 544 |
+
module.fuse_projections(fuse=True)
|
| 545 |
+
|
| 546 |
+
self.set_attn_processor(FusedFluxAttnProcessor2_0())
|
| 547 |
+
|
| 548 |
+
# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
|
| 549 |
+
def unfuse_qkv_projections(self):
|
| 550 |
+
"""Disables the fused QKV projection if enabled.
|
| 551 |
+
|
| 552 |
+
<Tip warning={true}>
|
| 553 |
+
|
| 554 |
+
This API is 🧪 experimental.
|
| 555 |
+
|
| 556 |
+
</Tip>
|
| 557 |
+
|
| 558 |
+
"""
|
| 559 |
+
if self.original_attn_processors is not None:
|
| 560 |
+
self.set_attn_processor(self.original_attn_processors)
|
| 561 |
+
|
| 562 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
| 563 |
+
if hasattr(module, "gradient_checkpointing"):
|
| 564 |
+
module.gradient_checkpointing = value
|
| 565 |
+
|
| 566 |
+
def _format_input(self):
|
| 567 |
+
pass
|
| 568 |
+
|
| 569 |
+
def _format_output(self):
|
| 570 |
+
pass
|
| 571 |
+
|
| 572 |
+
def forward(
|
| 573 |
+
self,
|
| 574 |
+
hidden_states: torch.Tensor,
|
| 575 |
+
encoder_hidden_states: torch.Tensor = None,
|
| 576 |
+
cond_input: dict = None,
|
| 577 |
+
pooled_projections: torch.Tensor = None,
|
| 578 |
+
timestep: torch.LongTensor = None,
|
| 579 |
+
img_ids: torch.Tensor = None,
|
| 580 |
+
txt_ids: torch.Tensor = None,
|
| 581 |
+
guidance: torch.Tensor = None,
|
| 582 |
+
joint_attention_kwargs: Optional[Dict[str, Any]] = None,
|
| 583 |
+
controlnet_block_samples=None,
|
| 584 |
+
controlnet_single_block_samples=None,
|
| 585 |
+
return_dict: bool = True,
|
| 586 |
+
controlnet_blocks_repeat: bool = False,
|
| 587 |
+
data_num_per_group: int = 1,
|
| 588 |
+
image_tags=None,
|
| 589 |
+
context_tags=None,
|
| 590 |
+
max_sequence_length: int = 512,
|
| 591 |
+
mix_attention_double=True,
|
| 592 |
+
mix_attention_single=True,
|
| 593 |
+
) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
|
| 594 |
+
"""
|
| 595 |
+
The [`FluxTransformer2DModel`] forward method.
|
| 596 |
+
|
| 597 |
+
Args:
|
| 598 |
+
hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
|
| 599 |
+
Input `hidden_states`.
|
| 600 |
+
encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, sequence_len, embed_dims)`):
|
| 601 |
+
Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
|
| 602 |
+
pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`): Embeddings projected
|
| 603 |
+
from the embeddings of input conditions.
|
| 604 |
+
timestep ( `torch.LongTensor`):
|
| 605 |
+
Used to indicate denoising step.
|
| 606 |
+
block_controlnet_hidden_states: (`list` of `torch.Tensor`):
|
| 607 |
+
A list of tensors that if specified are added to the residuals of transformer blocks.
|
| 608 |
+
joint_attention_kwargs (`dict`, *optional*):
|
| 609 |
+
A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
|
| 610 |
+
`self.processor` in
|
| 611 |
+
[diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
| 612 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
| 613 |
+
Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
|
| 614 |
+
tuple.
|
| 615 |
+
|
| 616 |
+
Returns:
|
| 617 |
+
If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
|
| 618 |
+
`tuple` where the first element is the sample tensor.
|
| 619 |
+
"""
|
| 620 |
+
if joint_attention_kwargs is not None:
|
| 621 |
+
joint_attention_kwargs = joint_attention_kwargs.copy()
|
| 622 |
+
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
| 623 |
+
else:
|
| 624 |
+
lora_scale = 1.0
|
| 625 |
+
|
| 626 |
+
if USE_PEFT_BACKEND:
|
| 627 |
+
# weight the lora layers by setting `lora_scale` for each PEFT layer
|
| 628 |
+
scale_lora_layers(self, lora_scale)
|
| 629 |
+
else:
|
| 630 |
+
if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
|
| 631 |
+
logger.warning(
|
| 632 |
+
"Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
|
| 633 |
+
)
|
| 634 |
+
|
| 635 |
+
hidden_states = self.x_embedder(hidden_states)
|
| 636 |
+
|
| 637 |
+
mask_cond = None
|
| 638 |
+
mask_ids = None
|
| 639 |
+
if cond_input is not None:
|
| 640 |
+
cond_image_latents = cond_input["image_latents"]
|
| 641 |
+
cond_image_ids = cond_input["image_ids"]
|
| 642 |
+
cond_latents = self.x_embedder(cond_image_latents)
|
| 643 |
+
|
| 644 |
+
if joint_attention_kwargs is not None and "mask_cond" in joint_attention_kwargs:
|
| 645 |
+
mask_cond = joint_attention_kwargs.pop("mask_cond")
|
| 646 |
+
mask_ids = joint_attention_kwargs.pop("mask_ids")
|
| 647 |
+
if mask_cond is not None:
|
| 648 |
+
mask_cond = self.forward_mask_attn(mask_cond, cond_latents[:1])
|
| 649 |
+
# joint_attention_kwargs["mask_cond"] = mask_cond
|
| 650 |
+
# hidden_states = hidden_states + mask_cond
|
| 651 |
+
|
| 652 |
+
if image_tags is not None:
|
| 653 |
+
image_tag_embeddings = self.image_tag_embeddings(
|
| 654 |
+
torch.Tensor(
|
| 655 |
+
image_tags,
|
| 656 |
+
).to(device=hidden_states.device, dtype=torch.int64)
|
| 657 |
+
)
|
| 658 |
+
bsz = hidden_states.shape[0] // data_num_per_group
|
| 659 |
+
image_tag_embeddings = image_tag_embeddings.repeat_interleave(bsz, dim=0)
|
| 660 |
+
if cond_input is not None:
|
| 661 |
+
hidden_states = hidden_states + image_tag_embeddings[0]
|
| 662 |
+
cond_latents = cond_latents + image_tag_embeddings[1:].unsqueeze(1)
|
| 663 |
+
else:
|
| 664 |
+
# for debug
|
| 665 |
+
if len(hidden_states) != len(image_tag_embeddings):
|
| 666 |
+
hidden_states += image_tag_embeddings[:1].unsqueeze(1)
|
| 667 |
+
else:
|
| 668 |
+
hidden_states = hidden_states + image_tag_embeddings.unsqueeze(1)
|
| 669 |
+
|
| 670 |
+
timestep = timestep.to(hidden_states.dtype) * 1000
|
| 671 |
+
if guidance is not None:
|
| 672 |
+
guidance = guidance.to(hidden_states.dtype) * 1000
|
| 673 |
+
else:
|
| 674 |
+
guidance = None
|
| 675 |
+
|
| 676 |
+
temb = (
|
| 677 |
+
self.time_text_embed(timestep, pooled_projections)
|
| 678 |
+
if guidance is None
|
| 679 |
+
else self.time_text_embed(timestep, guidance, pooled_projections)
|
| 680 |
+
)
|
| 681 |
+
if cond_input is not None:
|
| 682 |
+
cond_time = 0
|
| 683 |
+
cond_temb = ( self.time_text_embed(torch.ones_like(timestep)*cond_time, pooled_projections)
|
| 684 |
+
if guidance is None
|
| 685 |
+
else self.time_text_embed(torch.ones_like(timestep)*cond_time, guidance, pooled_projections)
|
| 686 |
+
)
|
| 687 |
+
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
| 688 |
+
|
| 689 |
+
if context_tags is not None:
|
| 690 |
+
context_tag_embeddings = self.context_tag_embeddings(
|
| 691 |
+
torch.Tensor(
|
| 692 |
+
image_tags,
|
| 693 |
+
).to(device=hidden_states.device, dtype=torch.int64)
|
| 694 |
+
)
|
| 695 |
+
bsz = hidden_states.shape[0] // data_num_per_group
|
| 696 |
+
context_tag_embeddings = context_tag_embeddings.repeat_interleave(bsz, dim=0)
|
| 697 |
+
if cond_input is not None:
|
| 698 |
+
encoder_hidden_states = encoder_hidden_states + context_tag_embeddings[0]
|
| 699 |
+
else:
|
| 700 |
+
if len(encoder_hidden_states) != len(context_tag_embeddings):
|
| 701 |
+
encoder_hidden_states += context_tag_embeddings[:1].unsqueeze(1)
|
| 702 |
+
else:
|
| 703 |
+
encoder_hidden_states = encoder_hidden_states + context_tag_embeddings.unsqueeze(1)
|
| 704 |
+
|
| 705 |
+
if mask_cond is not None:
|
| 706 |
+
encoder_hidden_states = torch.cat([encoder_hidden_states, mask_cond], dim=1) # todo: compare with add
|
| 707 |
+
max_sequence_length = encoder_hidden_states.shape[1]
|
| 708 |
+
|
| 709 |
+
txt_ids = torch.cat((txt_ids, mask_ids), dim=0)
|
| 710 |
+
|
| 711 |
+
if isinstance(img_ids, list):
|
| 712 |
+
image_rotary_emb = []
|
| 713 |
+
for img_ids_ in img_ids:
|
| 714 |
+
ids = torch.cat((txt_ids, img_ids_), dim=0)
|
| 715 |
+
image_rotary_emb.append(self.pos_embed(ids))
|
| 716 |
+
image_rotary_emb = ( # to batch, cos / sin
|
| 717 |
+
torch.stack([_[0] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
|
| 718 |
+
torch.stack([_[1] for _ in image_rotary_emb]).repeat_interleave(hidden_states.shape[0] // len(img_ids), dim=0).clone(),
|
| 719 |
+
)
|
| 720 |
+
else:
|
| 721 |
+
ids = torch.cat((txt_ids, img_ids), dim=0)
|
| 722 |
+
image_rotary_emb = self.pos_embed(ids)
|
| 723 |
+
if cond_input is not None:
|
| 724 |
+
cond_rotary_emb = []
|
| 725 |
+
for image_ids in cond_image_ids:
|
| 726 |
+
cond_rotary_emb.append(self.pos_embed(image_ids))
|
| 727 |
+
cond_rotary_emb = (
|
| 728 |
+
torch.stack([_[0] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
|
| 729 |
+
torch.stack([_[1] for _ in cond_rotary_emb]).repeat_interleave(cond_latents.shape[0] // len(cond_image_ids), dim=0).clone(),
|
| 730 |
+
)
|
| 731 |
+
|
| 732 |
+
if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
|
| 733 |
+
ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
|
| 734 |
+
ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
|
| 735 |
+
joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
|
| 736 |
+
|
| 737 |
+
for index_block, block in enumerate(self.transformer_blocks):
|
| 738 |
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
| 739 |
+
|
| 740 |
+
def create_custom_forward(module, return_dict=None):
|
| 741 |
+
def custom_forward(*inputs):
|
| 742 |
+
if return_dict is not None:
|
| 743 |
+
return module(*inputs, return_dict=return_dict)
|
| 744 |
+
else:
|
| 745 |
+
return module(*inputs)
|
| 746 |
+
|
| 747 |
+
return custom_forward
|
| 748 |
+
|
| 749 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
| 750 |
+
# ckpt_kwargs.update(joint_attention_kwargs)
|
| 751 |
+
block_output = torch.utils.checkpoint.checkpoint(
|
| 752 |
+
create_custom_forward(block),
|
| 753 |
+
hidden_states,
|
| 754 |
+
encoder_hidden_states,
|
| 755 |
+
temb,
|
| 756 |
+
image_rotary_emb,
|
| 757 |
+
data_num_per_group,
|
| 758 |
+
max_sequence_length,
|
| 759 |
+
mix_attention_double,
|
| 760 |
+
cond_temb if cond_input is not None else None,
|
| 761 |
+
cond_rotary_emb if cond_input is not None else None,
|
| 762 |
+
cond_latents if cond_input is not None else None,
|
| 763 |
+
joint_attention_kwargs,
|
| 764 |
+
**ckpt_kwargs,
|
| 765 |
+
)
|
| 766 |
+
else:
|
| 767 |
+
block_output = block(
|
| 768 |
+
hidden_states=hidden_states,
|
| 769 |
+
encoder_hidden_states=encoder_hidden_states,
|
| 770 |
+
temb=temb,
|
| 771 |
+
image_rotary_emb=image_rotary_emb,
|
| 772 |
+
data_num_per_group=data_num_per_group,
|
| 773 |
+
max_sequence_length=max_sequence_length,
|
| 774 |
+
mix_attention=mix_attention_double,
|
| 775 |
+
cond_temb = cond_temb if cond_input is not None else None,
|
| 776 |
+
cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
|
| 777 |
+
cond_latents = cond_latents if cond_input is not None else None,
|
| 778 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
| 779 |
+
)
|
| 780 |
+
|
| 781 |
+
if cond_input is not None and mix_attention_double:
|
| 782 |
+
encoder_hidden_states, hidden_states, cond_latents = block_output
|
| 783 |
+
else:
|
| 784 |
+
encoder_hidden_states, hidden_states = block_output
|
| 785 |
+
|
| 786 |
+
# controlnet residual
|
| 787 |
+
if controlnet_block_samples is not None:
|
| 788 |
+
interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
|
| 789 |
+
interval_control = int(np.ceil(interval_control))
|
| 790 |
+
# For Xlabs ControlNet.
|
| 791 |
+
if controlnet_blocks_repeat:
|
| 792 |
+
hidden_states = (
|
| 793 |
+
hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
|
| 794 |
+
)
|
| 795 |
+
else:
|
| 796 |
+
hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
|
| 797 |
+
|
| 798 |
+
hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
|
| 799 |
+
|
| 800 |
+
for index_block, block in enumerate(self.single_transformer_blocks):
|
| 801 |
+
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
| 802 |
+
|
| 803 |
+
def create_custom_forward(module, return_dict=None):
|
| 804 |
+
def custom_forward(*inputs):
|
| 805 |
+
if return_dict is not None:
|
| 806 |
+
return module(*inputs, return_dict=return_dict)
|
| 807 |
+
else:
|
| 808 |
+
return module(*inputs)
|
| 809 |
+
|
| 810 |
+
return custom_forward
|
| 811 |
+
|
| 812 |
+
ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
|
| 813 |
+
hidden_states = torch.utils.checkpoint.checkpoint(
|
| 814 |
+
create_custom_forward(block),
|
| 815 |
+
hidden_states,
|
| 816 |
+
temb,
|
| 817 |
+
image_rotary_emb,
|
| 818 |
+
data_num_per_group,
|
| 819 |
+
max_sequence_length,
|
| 820 |
+
mix_attention_single,
|
| 821 |
+
cond_temb if cond_input is not None else None,
|
| 822 |
+
cond_rotary_emb if cond_input is not None else None,
|
| 823 |
+
cond_latents if cond_input is not None else None,
|
| 824 |
+
joint_attention_kwargs,
|
| 825 |
+
**ckpt_kwargs,
|
| 826 |
+
)
|
| 827 |
+
|
| 828 |
+
else:
|
| 829 |
+
hidden_states = block(
|
| 830 |
+
hidden_states=hidden_states,
|
| 831 |
+
temb=temb,
|
| 832 |
+
image_rotary_emb=image_rotary_emb,
|
| 833 |
+
data_num_per_group=data_num_per_group,
|
| 834 |
+
max_sequence_length=max_sequence_length,
|
| 835 |
+
mix_attention=mix_attention_single,
|
| 836 |
+
cond_temb = cond_temb if cond_input is not None else None,
|
| 837 |
+
cond_image_rotary_emb = cond_rotary_emb if cond_input is not None else None,
|
| 838 |
+
cond_latents = cond_latents if cond_input is not None else None,
|
| 839 |
+
joint_attention_kwargs=joint_attention_kwargs,
|
| 840 |
+
)
|
| 841 |
+
|
| 842 |
+
if cond_input is not None and mix_attention_single:
|
| 843 |
+
hidden_states, cond_latents = hidden_states
|
| 844 |
+
|
| 845 |
+
# controlnet residual
|
| 846 |
+
if controlnet_single_block_samples is not None:
|
| 847 |
+
interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
|
| 848 |
+
interval_control = int(np.ceil(interval_control))
|
| 849 |
+
hidden_states[:, encoder_hidden_states.shape[1]:, ...] = (
|
| 850 |
+
hidden_states[:, encoder_hidden_states.shape[1]:, ...]
|
| 851 |
+
+ controlnet_single_block_samples[index_block // interval_control]
|
| 852 |
+
)
|
| 853 |
+
|
| 854 |
+
hidden_states = hidden_states[:, encoder_hidden_states.shape[1]:, ...]
|
| 855 |
+
|
| 856 |
+
hidden_states = self.norm_out(hidden_states, temb)
|
| 857 |
+
output = self.proj_out(hidden_states)
|
| 858 |
+
|
| 859 |
+
if USE_PEFT_BACKEND:
|
| 860 |
+
# remove `lora_scale` from each PEFT layer
|
| 861 |
+
unscale_lora_layers(self, lora_scale)
|
| 862 |
+
|
| 863 |
+
if not return_dict:
|
| 864 |
+
return (output,)
|
| 865 |
+
|
| 866 |
+
return Transformer2DModelOutput(sample=output)
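
A self-contained smoke test of the full forward pass with a deliberately tiny configuration (illustrative only: cond_input/image_tags are omitted, and it assumes the shared attention processor reduces to standard Flux attention when no condition latents are passed):

import torch

tiny = FluxTransformer2DModel(
    in_channels=16, num_layers=1, num_single_layers=1,
    attention_head_dim=32, num_attention_heads=4,
    joint_attention_dim=64, pooled_projection_dim=32,
    axes_dims_rope=(8, 12, 12),              # must sum to attention_head_dim
)
hidden_states = torch.randn(1, 64, 16)       # packed latents of an 8x8 latent grid
encoder_hidden_states = torch.randn(1, 8, 64)
out = tiny(
    hidden_states=hidden_states,
    encoder_hidden_states=encoder_hidden_states,
    pooled_projections=torch.randn(1, 32),
    timestep=torch.tensor([0.5]),            # forward() rescales this by 1000
    img_ids=torch.zeros(64, 3),              # RoPE ids, see _prepare_latent_image_ids in inference_utils
    txt_ids=torch.zeros(8, 3),
    max_sequence_length=8,
    return_dict=False,
)[0]
print(out.shape)                             # torch.Size([1, 64, 16])
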
|
dreamfuse/trains/utils/__pycache__/inference_utils.cpython-310.pyc
ADDED
|
Binary file (8.68 kB). View file
|
|
|
dreamfuse/trains/utils/inference_utils.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
| 1 |
+
import torch
|
| 2 |
+
from diffusers.utils.torch_utils import randn_tensor
|
| 3 |
+
import numpy as np
|
| 4 |
+
from einops import rearrange
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
def get_mask_affine(mask1, mask2):
|
| 8 |
+
box1 = mask1.getbbox()
|
| 9 |
+
box2 = mask2.getbbox()
|
| 10 |
+
|
| 11 |
+
if box1 is None or box2 is None:
|
| 12 |
+
affine_coeffs = [1, 0, 0, 0, 1, 0]
|
| 13 |
+
return affine_coeffs
|
| 14 |
+
|
| 15 |
+
left1, top1, right1, bottom1 = box1
|
| 16 |
+
left2, top2, right2, bottom2 = box2
|
| 17 |
+
|
| 18 |
+
w1, h1 = right1 - left1, bottom1 - top1
|
| 19 |
+
w2, h2 = right2 - left2, bottom2 - top2
|
| 20 |
+
|
| 21 |
+
scale_x = w1 / w2
|
| 22 |
+
scale_y = h1 / h2
|
| 23 |
+
|
| 24 |
+
tx = left1 - left2*scale_x
|
| 25 |
+
ty = top1 - top2*scale_y
|
| 26 |
+
|
| 27 |
+
affine_coeffs = [scale_x, 0, tx, 0, scale_y, ty]
|
| 28 |
+
return affine_coeffs
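
For example (a hedged illustration with synthetic masks): the returned list is the PIL-style [a, b, c, d, e, f] affine that maps the bounding box of mask2 onto the bounding box of mask1.

from PIL import Image, ImageDraw

mask1 = Image.new("L", (512, 512), 0)
ImageDraw.Draw(mask1).rectangle([100, 100, 300, 300], fill=255)
mask2 = Image.new("L", (512, 512), 0)
ImageDraw.Draw(mask2).rectangle([0, 0, 100, 100], fill=255)
coeffs = get_mask_affine(mask1, mask2)   # [scale_x, 0, t_x, 0, scale_y, t_y]
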
|
| 29 |
+
|
| 30 |
+
def tokenize_prompt(tokenizer, prompt, max_sequence_length):
|
| 31 |
+
text_inputs = tokenizer(
|
| 32 |
+
prompt,
|
| 33 |
+
padding="max_length",
|
| 34 |
+
max_length=max_sequence_length,
|
| 35 |
+
truncation=True,
|
| 36 |
+
return_length=False,
|
| 37 |
+
return_overflowing_tokens=False,
|
| 38 |
+
return_tensors="pt",
|
| 39 |
+
)
|
| 40 |
+
text_input_ids = text_inputs.input_ids
|
| 41 |
+
return text_input_ids
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _encode_prompt_with_t5(
|
| 45 |
+
text_encoder,
|
| 46 |
+
tokenizer,
|
| 47 |
+
max_sequence_length=512,
|
| 48 |
+
prompt=None,
|
| 49 |
+
num_images_per_prompt=1,
|
| 50 |
+
device=None,
|
| 51 |
+
text_input_ids=None,
|
| 52 |
+
):
|
| 53 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
| 54 |
+
batch_size = len(prompt)
|
| 55 |
+
|
| 56 |
+
if tokenizer is not None:
|
| 57 |
+
text_inputs = tokenizer(
|
| 58 |
+
prompt,
|
| 59 |
+
padding="max_length",
|
| 60 |
+
max_length=max_sequence_length,
|
| 61 |
+
truncation=True,
|
| 62 |
+
return_length=False,
|
| 63 |
+
return_overflowing_tokens=False,
|
| 64 |
+
return_tensors="pt",
|
| 65 |
+
)
|
| 66 |
+
text_input_ids = text_inputs.input_ids
|
| 67 |
+
else:
|
| 68 |
+
if text_input_ids is None:
|
| 69 |
+
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
| 70 |
+
prompt_embeds = text_encoder(text_input_ids.to(device))[0]
|
| 71 |
+
|
| 72 |
+
dtype = text_encoder.dtype
|
| 73 |
+
prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
|
| 74 |
+
|
| 75 |
+
_, seq_len, _ = prompt_embeds.shape
|
| 76 |
+
|
| 77 |
+
# duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
|
| 78 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
| 79 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
|
| 80 |
+
|
| 81 |
+
return prompt_embeds
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _encode_prompt_with_clip(
|
| 85 |
+
text_encoder,
|
| 86 |
+
tokenizer,
|
| 87 |
+
prompt: str,
|
| 88 |
+
device=None,
|
| 89 |
+
text_input_ids=None,
|
| 90 |
+
num_images_per_prompt: int = 1,
|
| 91 |
+
):
|
| 92 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
| 93 |
+
batch_size = len(prompt)
|
| 94 |
+
|
| 95 |
+
if tokenizer is not None:
|
| 96 |
+
text_inputs = tokenizer(
|
| 97 |
+
prompt,
|
| 98 |
+
padding="max_length",
|
| 99 |
+
max_length=77,
|
| 100 |
+
truncation=True,
|
| 101 |
+
return_overflowing_tokens=False,
|
| 102 |
+
return_length=False,
|
| 103 |
+
return_tensors="pt",
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
text_input_ids = text_inputs.input_ids
|
| 107 |
+
else:
|
| 108 |
+
if text_input_ids is None:
|
| 109 |
+
raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
|
| 110 |
+
|
| 111 |
+
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
|
| 112 |
+
|
| 113 |
+
# Use pooled output of CLIPTextModel
|
| 114 |
+
prompt_embeds = prompt_embeds.pooler_output
|
| 115 |
+
prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
|
| 116 |
+
|
| 117 |
+
# duplicate text embeddings for each generation per prompt, using mps friendly method
|
| 118 |
+
prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
|
| 119 |
+
prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
|
| 120 |
+
|
| 121 |
+
return prompt_embeds
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def compute_text_embeddings(config, prompt, text_encoders, tokenizers, device):
|
| 125 |
+
with torch.no_grad():
|
| 126 |
+
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
|
| 127 |
+
text_encoders, tokenizers, prompt, config.max_sequence_length
|
| 128 |
+
)
|
| 129 |
+
prompt_embeds = prompt_embeds.to(device)
|
| 130 |
+
pooled_prompt_embeds = pooled_prompt_embeds.to(device)
|
| 131 |
+
text_ids = text_ids.to(device)
|
| 132 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def _prepare_image_ids(height, width, offset_h=0, offset_w=0):
|
| 136 |
+
image_ids = torch.zeros(height, width, 3)
|
| 137 |
+
image_ids[..., 1] = image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
|
| 138 |
+
image_ids[..., 2] = image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
|
| 139 |
+
image_ids = image_ids.reshape(-1, 3)
|
| 140 |
+
return image_ids
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
|
| 144 |
+
latents = latents.view(
|
| 145 |
+
batch_size, num_channels_latents, height // 2, 2, width // 2, 2
|
| 146 |
+
)
|
| 147 |
+
latents = latents.permute(0, 2, 4, 1, 3, 5)
|
| 148 |
+
latents = latents.reshape(
|
| 149 |
+
batch_size, (height // 2) * (width // 2), num_channels_latents * 4
|
| 150 |
+
)
|
| 151 |
+
|
| 152 |
+
return latents
|
| 153 |
+
|
| 154 |
+
def _unpack_latents(latents, height, width, vae_downsample_factor):
|
| 155 |
+
batch_size, num_patches, channels = latents.shape
|
| 156 |
+
|
| 157 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
| 158 |
+
# latent height and width to be divisible by 2.
|
| 159 |
+
height = 2 * (int(height) // (vae_downsample_factor * 2))
|
| 160 |
+
width = 2 * (int(width) // (vae_downsample_factor * 2))
|
| 161 |
+
|
| 162 |
+
latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
|
| 163 |
+
latents = latents.permute(0, 3, 1, 4, 2, 5)
|
| 164 |
+
|
| 165 |
+
latents = latents.reshape(batch_size, channels // (2 * 2), height, width)
|
| 166 |
+
|
| 167 |
+
return latents
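
A quick round-trip check (illustrative) of the 2x2 patch packing used throughout: _pack_latents turns a (B, C, H, W) latent into (B, H/2 * W/2, 4C) tokens and _unpack_latents inverts it given the pixel-space size and the VAE downsample factor.

import torch

latents = torch.randn(1, 16, 64, 64)              # (B, C, H_lat, W_lat)
packed = _pack_latents(latents, 1, 16, 64, 64)    # -> (1, 1024, 64)
unpacked = _unpack_latents(packed, 512, 512, 8)   # 512 px / 8 = 64 latent rows/cols
assert torch.equal(unpacked, latents)
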
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _prepare_latent_image_ids(batch_size, height, width, device, dtype, offset_h=0, offset_w=0):
|
| 171 |
+
latent_image_ids = torch.zeros(height, width, 3)
|
| 172 |
+
latent_image_ids[..., 1] = (
|
| 173 |
+
latent_image_ids[..., 1] + torch.arange(height)[:, None] + offset_h
|
| 174 |
+
)
|
| 175 |
+
latent_image_ids[..., 2] = (
|
| 176 |
+
latent_image_ids[..., 2] + torch.arange(width)[None, :] + offset_w
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
|
| 180 |
+
latent_image_ids.shape
|
| 181 |
+
)
|
| 182 |
+
|
| 183 |
+
latent_image_ids = latent_image_ids.reshape(
|
| 184 |
+
latent_image_id_height * latent_image_id_width, latent_image_id_channels
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
return latent_image_ids.to(device=device, dtype=dtype)
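
Illustrative check of the position-id layout: every packed token gets a (tag, row, col) triple, and the offsets are what shift condition images into their own region of the RoPE coordinate space.

import torch

ids = _prepare_latent_image_ids(1, 2, 3, "cpu", torch.float32, offset_w=3)
# tensor([[0., 0., 3.],
#         [0., 0., 4.],
#         [0., 0., 5.],
#         [0., 1., 3.],
#         [0., 1., 4.],
#         [0., 1., 5.]])
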
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def pil_to_tensor(image, device="cpu"):
|
| 191 |
+
image = np.array(image)
|
| 192 |
+
image = torch.from_numpy(image).float() / 127.5 - 1.0
|
| 193 |
+
image = image.permute(2, 0, 1).to(device)
|
| 194 |
+
return image
|
| 195 |
+
|
| 196 |
+
@torch.no_grad()
|
| 197 |
+
def encode_images_cond(vae_model, condition_images, device):
|
| 198 |
+
condition_image_tensors = []
|
| 199 |
+
for condition_image in condition_images:
|
| 200 |
+
condition_image_tensor = torch.tensor(np.array(condition_image)).to(device).permute(0, 3, 1, 2) # shape: [n_cond, c, h, w]
|
| 201 |
+
condition_image_tensor = condition_image_tensor / 127.5 - 1.0
|
| 202 |
+
condition_image_tensors.append(condition_image_tensor)
|
| 203 |
+
condition_image_tensors = torch.stack(condition_image_tensors) # shape: [bs, n_cond, c, h, w]
|
| 204 |
+
condition_image_tensors = rearrange(condition_image_tensors, 'b n c h w -> (b n) c h w')
|
| 205 |
+
|
| 206 |
+
# encode condition images
|
| 207 |
+
condition_image_latents = (
|
| 208 |
+
vae_model.encode(
|
| 209 |
+
condition_image_tensors.to(vae_model.dtype)
|
| 210 |
+
).latent_dist.sample()
|
| 211 |
+
) # shape: [bs*n_cond, c, h // 8, w // 8]
|
| 212 |
+
condition_image_latents = (condition_image_latents - vae_model.config.shift_factor) * vae_model.config.scaling_factor
|
| 213 |
+
|
| 214 |
+
return condition_image_latents
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def prepare_latents(
|
| 218 |
+
batch_size,
|
| 219 |
+
num_channels_latents,
|
| 220 |
+
vae_downsample_factor,
|
| 221 |
+
height,
|
| 222 |
+
width,
|
| 223 |
+
dtype,
|
| 224 |
+
device,
|
| 225 |
+
generator,
|
| 226 |
+
latents=None,
|
| 227 |
+
offset=None,
|
| 228 |
+
hw=False,
|
| 229 |
+
):
|
| 230 |
+
# VAE applies 8x compression on images but we must also account for packing which requires
|
| 231 |
+
# latent height and width to be divisible by 2.
|
| 232 |
+
height = 2 * (int(height) // (vae_downsample_factor * 2))
|
| 233 |
+
width = 2 * (int(width) // (vae_downsample_factor * 2))
|
| 234 |
+
|
| 235 |
+
shape = (batch_size, num_channels_latents, height, width)
|
| 236 |
+
|
| 237 |
+
if latents is not None:
|
| 238 |
+
if offset is None:
|
| 239 |
+
latent_image_ids = _prepare_latent_image_ids(
|
| 240 |
+
batch_size, height // 2, width // 2, device, dtype
|
| 241 |
+
)
|
| 242 |
+
else:
|
| 243 |
+
latent_image_ids = []
|
| 244 |
+
for offset_ in offset:
|
| 245 |
+
latent_image_ids.append(
|
| 246 |
+
_prepare_latent_image_ids(
|
| 247 |
+
batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
|
| 248 |
+
)
|
| 249 |
+
)
|
| 250 |
+
return latents.to(device=device, dtype=dtype), latent_image_ids
|
| 251 |
+
|
| 252 |
+
if isinstance(generator, list) and len(generator) != batch_size:
|
| 253 |
+
raise ValueError(
|
| 254 |
+
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
|
| 255 |
+
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
| 256 |
+
)
|
| 257 |
+
|
| 258 |
+
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
|
| 259 |
+
latents = _pack_latents(
|
| 260 |
+
latents, batch_size, num_channels_latents, height, width
|
| 261 |
+
)
|
| 262 |
+
if offset is None:
|
| 263 |
+
latent_image_ids = _prepare_latent_image_ids(
|
| 264 |
+
batch_size, height // 2, width // 2, device, dtype
|
| 265 |
+
)
|
| 266 |
+
else:
|
| 267 |
+
latent_image_ids = []
|
| 268 |
+
for offset_ in offset:
|
| 269 |
+
latent_image_ids.append(
|
| 270 |
+
_prepare_latent_image_ids(
|
| 271 |
+
batch_size, height // 2, width // 2, device, dtype, offset_w=offset_ * width // 2, offset_h=offset_ * height // 2 if hw else 0
|
| 272 |
+
)
|
| 273 |
+
)
|
| 274 |
+
return latents, latent_image_ids
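
A hedged usage sketch: for a 1024x1024 generation with the Flux VAE (16 latent channels, 8x downsampling) this yields packed noise of shape (1, 4096, 64), plus one id grid per entry in `offset`, each shifted along the column axis.

import torch

latents, image_ids = prepare_latents(
    batch_size=1, num_channels_latents=16, vae_downsample_factor=8,
    height=1024, width=1024, dtype=torch.float32, device="cpu",
    generator=torch.Generator().manual_seed(0), offset=[0, 1],
)
print(latents.shape)      # torch.Size([1, 4096, 64])
print(image_ids[1][0])    # tensor([ 0.,  0., 64.]) -> second grid shifted by width // 2 columns
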
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
@torch.no_grad()
|
| 278 |
+
def encode_prompt(
|
| 279 |
+
text_encoders,
|
| 280 |
+
tokenizers,
|
| 281 |
+
prompt: str,
|
| 282 |
+
max_sequence_length,
|
| 283 |
+
device=None,
|
| 284 |
+
num_images_per_prompt: int = 1,
|
| 285 |
+
text_input_ids_list=None,
|
| 286 |
+
):
|
| 287 |
+
prompt = [prompt] if isinstance(prompt, str) else prompt
|
| 288 |
+
dtype = text_encoders[0].dtype
|
| 289 |
+
|
| 290 |
+
pooled_prompt_embeds = _encode_prompt_with_clip(
|
| 291 |
+
text_encoder=text_encoders[0],
|
| 292 |
+
tokenizer=tokenizers[0],
|
| 293 |
+
prompt=prompt,
|
| 294 |
+
device=device if device is not None else text_encoders[0].device,
|
| 295 |
+
num_images_per_prompt=num_images_per_prompt,
|
| 296 |
+
text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
|
| 297 |
+
)
|
| 298 |
+
|
| 299 |
+
prompt_embeds = _encode_prompt_with_t5(
|
| 300 |
+
text_encoder=text_encoders[1],
|
| 301 |
+
tokenizer=tokenizers[1],
|
| 302 |
+
max_sequence_length=max_sequence_length,
|
| 303 |
+
prompt=prompt,
|
| 304 |
+
num_images_per_prompt=num_images_per_prompt,
|
| 305 |
+
device=device if device is not None else text_encoders[1].device,
|
| 306 |
+
text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
|
| 310 |
+
|
| 311 |
+
return prompt_embeds, pooled_prompt_embeds, text_ids
|
| 312 |
+
|
| 313 |
+
def warp_affine_tensor(input_tensor, mask_affines, output_size, scale_factor=1/16,
|
| 314 |
+
align_corners_grid=False, align_corners_sample=True,
|
| 315 |
+
flatten_output=True, device=None):
|
| 316 |
+
"""
|
| 317 |
+
Applies an affine transformation to the input tensor and returns the warped result.
|
| 318 |
+
|
| 319 |
+
Args:
|
| 320 |
+
input_tensor: Image tensor to transform; supported shapes are (H, W, C), (C, H, W) or (1, C, H, W).
|
| 321 |
+
mask_affines: Affine parameters (e.g. [a, 0, t_x, 0, e, t_y]); the translations are expressed in 512x512-image units.
|
| 322 |
+
output_size: Target spatial size, given as (H_out, W_out).
|
| 323 |
+
scale_factor: Scaling factor for the translation terms; e.g. going from 512 to 32 gives factor = 32/512 = 1/16.
|
| 324 |
+
align_corners_grid: `align_corners` argument passed to F.affine_grid.
|
| 325 |
+
align_corners_sample: `align_corners` argument passed to F.grid_sample.
|
| 326 |
+
flatten_output: If True, reshape the warped output from (1, C, H_out, W_out) to (-1, C).
|
| 327 |
+
device: If set, move the relevant tensors to this device.
|
| 328 |
+
|
| 329 |
+
Returns:
|
| 330 |
+
warped_output: The affine-warped tensor,
|
| 331 |
+
of shape (H_out*W_out, C) if flatten_output is True, otherwise (1, C, H_out, W_out).
|
| 332 |
+
"""
|
| 333 |
+
# If the input tensor is not batched (4D), reshape it to (1, C, H, W)
|
| 334 |
+
if input_tensor.dim() == 3:
|
| 335 |
+
# Treat a trailing dimension of 3 as (H, W, C) RGB and move channels first
|
| 336 |
+
if input_tensor.shape[-1] == 3:
|
| 337 |
+
input_tensor = input_tensor.permute(2, 0, 1)
|
| 338 |
+
input_tensor = input_tensor.unsqueeze(0)
|
| 339 |
+
elif input_tensor.dim() != 4:
|
| 340 |
+
raise ValueError("input_tensor 必须是 3D 或 4D Tensor!")
|
| 341 |
+
|
| 342 |
+
# Output size
|
| 343 |
+
H_out, W_out = output_size
|
| 344 |
+
B, C, H_in, W_in = input_tensor.shape
|
| 345 |
+
|
| 346 |
+
# Convert mask_affines to a tensor of shape (1, 6)
|
| 347 |
+
if not torch.is_tensor(mask_affines):
|
| 348 |
+
theta = torch.tensor(mask_affines, dtype=torch.float32).unsqueeze(0)
|
| 349 |
+
else:
|
| 350 |
+
theta = mask_affines.clone().float()
|
| 351 |
+
if theta.dim() == 1:
|
| 352 |
+
theta = theta.unsqueeze(0)
|
| 353 |
+
|
| 354 |
+
# Rescale the translation terms (3rd and 6th elements) to the target resolution
|
| 355 |
+
theta[0, 2] *= scale_factor # translation along x
|
| 356 |
+
theta[0, 5] *= scale_factor # translation along y
|
| 357 |
+
|
| 358 |
+
a = theta[0, 0]
|
| 359 |
+
t_x = theta[0, 2]
|
| 360 |
+
e = theta[0, 4]
|
| 361 |
+
t_y = theta[0, 5]
|
| 362 |
+
|
| 363 |
+
# Convert to the normalized [-1, 1] coordinate convention
|
| 364 |
+
# For x: the normalization is x_norm = 2*x/(W_out-1) - 1
|
| 365 |
+
# so the constant term of the normalized affine becomes a + 2*t_x/(W_out-1) - 1
|
| 366 |
+
theta_norm = torch.tensor([
|
| 367 |
+
[a, 0.0, a + 2*t_x/(W_out - 1) - 1],
|
| 368 |
+
[0.0, e, e + 2*t_y/(H_out - 1) - 1]
|
| 369 |
+
], dtype=torch.float32).unsqueeze(0)
|
| 370 |
+
|
| 371 |
+
# Build the affine_grid for the target output size; the grid has size (B, C, H_out, W_out)
|
| 372 |
+
grid = F.affine_grid(theta_norm, size=(B, C, H_out, W_out), align_corners=align_corners_grid)
|
| 373 |
+
if device is not None:
|
| 374 |
+
grid = grid.to(device)
|
| 375 |
+
input_tensor = input_tensor.to(device)
|
| 376 |
+
|
| 377 |
+
# Sample the input tensor with the grid
|
| 378 |
+
warped = F.grid_sample(input_tensor, grid, align_corners=align_corners_sample)
|
| 379 |
+
|
| 380 |
+
# Optionally flatten the output to (-1, C)
|
| 381 |
+
if flatten_output:
|
| 382 |
+
# (1, C, H_out, W_out) -> (H_out, W_out, C) -> reshape(-1, C)
|
| 383 |
+
warped = warped.squeeze(0).permute(1, 2, 0).reshape(-1, C)
|
| 384 |
+
return warped
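
A hedged example of the intended use: take affine coefficients estimated on 512x512 masks (see get_mask_affine above) and warp a 32x32 feature map with them, so scale_factor rescales the translations by 32/512.

import torch

feat = torch.randn(3, 32, 32)               # (C, H, W) feature map
coeffs = [1.0, 0, 64.0, 0, 1.0, 32.0]       # e.g. from get_mask_affine on 512x512 masks
warped = warp_affine_tensor(feat, coeffs, output_size=(32, 32), scale_factor=32 / 512)
print(warped.shape)                         # torch.Size([1024, 3]) since flatten_output=True
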
|
| 385 |
+
|
| 386 |
+
|
dreamfuse_inference.py
ADDED
|
@@ -0,0 +1,642 @@
|
|
| 1 |
+
import gc
|
| 2 |
+
import os
|
| 3 |
+
from typing import List
|
| 4 |
+
import contextlib
|
| 5 |
+
import torch.multiprocessing as mp
|
| 6 |
+
from dataclasses import dataclass, field
|
| 7 |
+
from collections import defaultdict
|
| 8 |
+
import random
|
| 9 |
+
import numpy as np
|
| 10 |
+
from PIL import Image, ImageOps
|
| 11 |
+
import json
|
| 12 |
+
import torch
|
| 13 |
+
from peft import PeftModel
|
| 14 |
+
import torch.nn.functional as F
|
| 15 |
+
import accelerate
|
| 16 |
+
import diffusers
|
| 17 |
+
from diffusers import FluxPipeline
|
| 18 |
+
from diffusers.utils.torch_utils import is_compiled_module
|
| 19 |
+
import transformers
|
| 20 |
+
from tqdm import tqdm
|
| 21 |
+
from peft import LoraConfig, set_peft_model_state_dict
|
| 22 |
+
from peft.utils import get_peft_model_state_dict
|
| 23 |
+
from dreamfuse.models.dreamfuse_flux.transformer import (
|
| 24 |
+
FluxTransformer2DModel,
|
| 25 |
+
FluxTransformerBlock,
|
| 26 |
+
FluxSingleTransformerBlock,
|
| 27 |
+
)
|
| 28 |
+
from diffusers.schedulers.scheduling_flow_match_euler_discrete import (
|
| 29 |
+
FlowMatchEulerDiscreteScheduler,
|
| 30 |
+
)
|
| 31 |
+
from diffusers.pipelines.flux.pipeline_flux import calculate_shift, retrieve_timesteps
|
| 32 |
+
from dreamfuse.trains.utils.inference_utils import (
|
| 33 |
+
compute_text_embeddings,
|
| 34 |
+
prepare_latents,
|
| 35 |
+
_unpack_latents,
|
| 36 |
+
_pack_latents,
|
| 37 |
+
_prepare_image_ids,
|
| 38 |
+
encode_images_cond,
|
| 39 |
+
get_mask_affine,
|
| 40 |
+
warp_affine_tensor
|
| 41 |
+
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def seed_everything(seed):
|
| 45 |
+
torch.manual_seed(seed)
|
| 46 |
+
torch.cuda.manual_seed(seed)
|
| 47 |
+
random.seed(seed)
|
| 48 |
+
np.random.seed(seed)
|
| 49 |
+
|
| 50 |
+
@dataclass
|
| 51 |
+
class InferenceConfig:
|
| 52 |
+
# Model paths
|
| 53 |
+
flux_model_id: str = 'black-forest-labs/FLUX.1-dev'
|
| 54 |
+
|
| 55 |
+
lora_id: str = ''
|
| 56 |
+
model_choice: str = 'dev'
|
| 57 |
+
# Model configs
|
| 58 |
+
lora_rank: int = 16
|
| 59 |
+
max_sequence_length: int = 256
|
| 60 |
+
guidance_scale: float = 3.5
|
| 61 |
+
num_inference_steps: int = 28
|
| 62 |
+
mask_ids: int = 16
|
| 63 |
+
mask_in_chans: int = 128
|
| 64 |
+
mask_out_chans: int = 3072
|
| 65 |
+
inference_scale = 1024
|
| 66 |
+
|
| 67 |
+
# Training configs
|
| 68 |
+
gradient_checkpointing: bool = False
|
| 69 |
+
mix_attention_double: bool = True
|
| 70 |
+
mix_attention_single: bool = True
|
| 71 |
+
|
| 72 |
+
# Image processing
|
| 73 |
+
image_ids_offset: List[int] = field(default_factory=lambda: [0, 0, 0])
|
| 74 |
+
image_tags: List[int] = field(default_factory=lambda: [0, 1, 2])
|
| 75 |
+
context_tags: List[int] = None
|
| 76 |
+
|
| 77 |
+
# Runtime configs
|
| 78 |
+
device: str = "cuda:0" # if torch.cuda.is_available() else "cpu"
|
| 79 |
+
dtype: torch.dtype = torch.bfloat16
|
| 80 |
+
seed: int = 1234
|
| 81 |
+
debug: bool = True
|
| 82 |
+
|
| 83 |
+
# I/O configs
|
| 84 |
+
valid_output_dir: str = "./inference_output"
|
| 85 |
+
valid_roots: List[str] = field(default_factory=lambda: [
|
| 86 |
+
"./",
|
| 87 |
+
])
|
| 88 |
+
valid_jsons: List[str] = field(default_factory=lambda: [
|
| 89 |
+
"./examples/data_dreamfuse.json",
|
| 90 |
+
])
|
| 91 |
+
ref_prompts: str = ""
|
| 92 |
+
|
| 93 |
+
truecfg: bool = False
|
| 94 |
+
text_strength: int = 5
|
| 95 |
+
|
| 96 |
+
# multi gpu
|
| 97 |
+
sub_idx:int = 0
|
| 98 |
+
total_num:int = 1
|
| 99 |
+
|
| 100 |
+
def adjust_fg_to_bg(image: Image.Image, mask: Image.Image, target_size: tuple) -> tuple[Image.Image, Image.Image]:
|
| 101 |
+
width, height = image.size
|
| 102 |
+
target_w, target_h = target_size
|
| 103 |
+
|
| 104 |
+
scale = min(target_w / width, target_h / height)
|
| 105 |
+
if scale < 1:
|
| 106 |
+
new_w = int(width * scale)
|
| 107 |
+
new_h = int(height * scale)
|
| 108 |
+
image = image.resize((new_w, new_h))
|
| 109 |
+
mask = mask.resize((new_w, new_h))
|
| 110 |
+
width, height = new_w, new_h
|
| 111 |
+
|
| 112 |
+
pad_w = target_w - width
|
| 113 |
+
pad_h = target_h - height
|
| 114 |
+
padding = (
|
| 115 |
+
pad_w // 2, # left
|
| 116 |
+
pad_h // 2, # top
|
| 117 |
+
(pad_w + 1) // 2, # right
|
| 118 |
+
(pad_h + 1) // 2 # bottom
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
image = ImageOps.expand(image, border=padding, fill=(255, 255, 255))
|
| 122 |
+
mask = ImageOps.expand(mask, border=padding, fill=0)
|
| 123 |
+
|
| 124 |
+
return image, mask
|
| 125 |
+
|
| 126 |
+
def find_nearest_bucket_size(input_width, input_height, mode="x64", bucket_size=1024):
|
| 127 |
+
"""
|
| 128 |
+
Finds the nearest bucket size for the given input size.
|
| 129 |
+
"""
|
| 130 |
+
buckets = {
|
| 131 |
+
512: [[ 256, 768 ], [ 320, 768 ], [ 320, 704 ], [ 384, 640 ], [ 448, 576 ], [ 512, 512 ], [ 576, 448 ], [ 640, 384 ], [ 704, 320 ], [ 768, 320 ], [ 768, 256 ]],
|
| 132 |
+
768: [[ 384, 1152 ], [ 480, 1152 ], [ 480, 1056 ], [ 576, 960 ], [ 672, 864 ], [ 768, 768 ], [ 864, 672 ], [ 960, 576 ], [ 1056, 480 ], [ 1152, 480 ], [ 1152, 384 ]],
|
| 133 |
+
1024: [[ 512, 1536 ], [ 640, 1536 ], [ 640, 1408 ], [ 768, 1280 ], [ 896, 1152 ], [ 1024, 1024 ], [ 1152, 896 ], [ 1280, 768 ], [ 1408, 640 ], [ 1536, 640 ], [ 1536, 512 ]]
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
buckets = buckets[bucket_size]
|
| 137 |
+
|
| 138 |
+
aspect_ratios = [w / h for (w, h) in buckets]
|
| 139 |
+
assert mode in ["x64", "x8"]
|
| 140 |
+
if mode == "x64":
|
| 141 |
+
asp = input_width / input_height
|
| 142 |
+
diff = [abs(ar - asp) for ar in aspect_ratios]
|
| 143 |
+
bucket_id = int(np.argmin(diff))
|
| 144 |
+
gen_width, gen_height = buckets[bucket_id]
|
| 145 |
+
elif mode == "x8":
|
| 146 |
+
max_pixels = 1024 * 1024
|
| 147 |
+
ratio = (max_pixels / (input_width * input_height)) ** (0.5)
|
| 148 |
+
gen_width, gen_height = round(input_width * ratio), round(input_height * ratio)
|
| 149 |
+
gen_width = gen_width - gen_width % 8
|
| 150 |
+
gen_height = gen_height - gen_height % 8
|
| 151 |
+
else:
|
| 152 |
+
raise NotImplementedError
|
| 153 |
+
return (gen_width, gen_height)
|
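# A quick worked example of the aspect-ratio matching above: in the default "x64"
# mode with bucket_size=1024, a 1920x1080 input has aspect ratio ~1.78; the closest
# bucket ratio is 1280/768 ~ 1.67, so find_nearest_bucket_size(1920, 1080) -> (1280, 768).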
| 154 |
+
|
| 155 |
+
def make_image_grid(images, rows, cols, size=None):
|
| 156 |
+
assert len(images) == rows * cols
|
| 157 |
+
|
| 158 |
+
if size is not None:
|
| 159 |
+
images = [img.resize((size[0], size[1])) for img in images]
|
| 160 |
+
|
| 161 |
+
w, h = images[0].size
|
| 162 |
+
grid = Image.new("RGB", size=(cols * w, rows * h))
|
| 163 |
+
|
| 164 |
+
for i, img in enumerate(images):
|
| 165 |
+
grid.paste(img.convert("RGB"), box=(i % cols * w, i // cols * h))
|
| 166 |
+
return grid
|
| 167 |
+
|
| 168 |
+
class DreamFuseInference:
|
| 169 |
+
def __init__(self, config: InferenceConfig):
|
| 170 |
+
self.config = config
|
| 171 |
+
print(config.device)
|
| 172 |
+
self.device = torch.device(config.device)
|
| 173 |
+
torch.backends.cuda.matmul.allow_tf32 = True
|
| 174 |
+
seed_everything(config.seed)
|
| 175 |
+
self._init_models()
|
| 176 |
+
|
| 177 |
+
def _init_models(self):
|
| 178 |
+
# Initialize tokenizers
|
| 179 |
+
self.tokenizer_one = transformers.CLIPTokenizer.from_pretrained(
|
| 180 |
+
self.config.flux_model_id, subfolder="tokenizer"
|
| 181 |
+
)
|
| 182 |
+
self.tokenizer_two = transformers.T5TokenizerFast.from_pretrained(
|
| 183 |
+
self.config.flux_model_id, subfolder="tokenizer_2"
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
# Initialize text encoders
|
| 187 |
+
self.text_encoder_one = transformers.CLIPTextModel.from_pretrained(
|
| 188 |
+
self.config.flux_model_id, subfolder="text_encoder"
|
| 189 |
+
).to(device=self.device, dtype=self.config.dtype)
|
| 190 |
+
self.text_encoder_two = transformers.T5EncoderModel.from_pretrained(
|
| 191 |
+
self.config.flux_model_id, subfolder="text_encoder_2"
|
| 192 |
+
).to(device=self.device, dtype=self.config.dtype)
|
| 193 |
+
|
| 194 |
+
# Initialize VAE
|
| 195 |
+
self.vae = diffusers.AutoencoderKL.from_pretrained(
|
| 196 |
+
self.config.flux_model_id, subfolder="vae"
|
| 197 |
+
).to(device=self.device, dtype=self.config.dtype)
|
| 198 |
+
|
| 199 |
+
# Initialize denoising model
|
| 200 |
+
self.denoise_model = FluxTransformer2DModel.from_pretrained(
|
| 201 |
+
self.config.flux_model_id, subfolder="transformer"
|
| 202 |
+
).to(device=self.device, dtype=self.config.dtype)
|
| 203 |
+
|
| 204 |
+
if self.config.image_tags is not None or self.config.context_tags is not None:
|
| 205 |
+
num_image_tag_embeddings = max(self.config.image_tags) + 1 if self.config.image_tags is not None else 0
|
| 206 |
+
num_context_tag_embeddings = max(self.config.context_tags) + 1 if self.config.context_tags is not None else 0
|
| 207 |
+
self.denoise_model.set_tag_embeddings(
|
| 208 |
+
num_image_tag_embeddings=num_image_tag_embeddings,
|
| 209 |
+
num_context_tag_embeddings=num_context_tag_embeddings,
|
| 210 |
+
)
|
| 211 |
+
|
| 212 |
+
# Add LoRA
|
| 213 |
+
self.denoise_model = PeftModel.from_pretrained(
|
| 214 |
+
self.denoise_model,
|
| 215 |
+
self.config.lora_id,
|
| 216 |
+
adapter_weights=[1.0],
|
| 217 |
+
device_map={"": self.device}
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
# Initialize scheduler
|
| 221 |
+
self.scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
|
| 222 |
+
self.config.flux_model_id, subfolder="scheduler"
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
# Set models to eval mode
|
| 226 |
+
for model in [self.text_encoder_one, self.text_encoder_two, self.vae, self.denoise_model]:
|
| 227 |
+
model.eval()
|
| 228 |
+
model.requires_grad_(False)
|
| 229 |
+
|
| 230 |
+
def _compute_text_embeddings(self, prompt):
|
| 231 |
+
return compute_text_embeddings(
|
| 232 |
+
self.config,
|
| 233 |
+
prompt,
|
| 234 |
+
[self.text_encoder_one, self.text_encoder_two],
|
| 235 |
+
[self.tokenizer_one, self.tokenizer_two],
|
| 236 |
+
self.device
|
| 237 |
+
)
|
| 238 |
+
|
| 239 |
+
def resize_to_fit_within(self, reference_image, target_image):
|
| 240 |
+
ref_width, ref_height = reference_image.size
|
| 241 |
+
target_width, target_height = target_image.size
|
| 242 |
+
|
| 243 |
+
scale_width = ref_width / target_width
|
| 244 |
+
scale_height = ref_height / target_height
|
| 245 |
+
scale = min(scale_width, scale_height)  # use the smaller ratio so the result stays within the reference image's width and height
|
| 246 |
+
|
| 247 |
+
new_width = int(target_width * scale)
|
| 248 |
+
new_height = int(target_height * scale)
|
| 249 |
+
|
| 250 |
+
resized_image = target_image.resize((new_width, new_height), Image.LANCZOS)
|
| 251 |
+
return resized_image
|
| 252 |
+
|
| 253 |
+
def pad_or_crop(self, img, target_size, fill_color=(255, 255, 255)):
|
| 254 |
+
"""
|
| 255 |
+
Center-align the input image and crop or pad it to target_size.
|
| 256 |
+
|
| 257 |
+
Args:
|
| 258 |
+
img - a PIL.Image object
|
| 259 |
+
target_size - target size (width, height)
|
| 260 |
+
fill_color - fill color for padding, white by default
|
| 261 |
+
|
| 262 |
+
Returns:
|
| 263 |
+
The adjusted PIL.Image object, sized to target_size.
|
| 264 |
+
"""
|
| 265 |
+
iw, ih = img.size
|
| 266 |
+
tw, th = target_size
|
| 267 |
+
|
| 268 |
+
# Crop region: if the source is larger than the target, crop the centered portion; otherwise keep the whole image
|
| 269 |
+
left = (iw - tw) // 2 if iw >= tw else 0
|
| 270 |
+
top = (ih - th) // 2 if ih >= th else 0
|
| 271 |
+
cropped = img.crop((left, top, left + min(iw, tw), top + min(ih, th)))
|
| 272 |
+
|
| 273 |
+
# Create a new image of the target size and paste the cropped image centered on it
|
| 274 |
+
new_img = Image.new(img.mode, target_size, fill_color)
|
| 275 |
+
offset = ((tw - cropped.width) // 2, (th - cropped.height) // 2)
|
| 276 |
+
new_img.paste(cropped, offset)
|
| 277 |
+
|
| 278 |
+
return new_img
|
| 279 |
+
|
| 280 |
+
def transform_foreground_original(self, original_fg, original_bg, transformation_info, canvas_size=400):
|
| 281 |
+
"""
|
| 282 |
+
Translate the original foreground image (original_fg) according to transformation_info.
|
| 283 |
+
Requirements:
|
| 284 |
+
1. The intermediate canvas has the same size as original_fg (the original foreground size is preserved);
|
| 285 |
+
2. For the displacement, the drag coordinates are mapped back from preview pixels to original-image pixels;
|
| 286 |
+
3. The relative drag offset is measured on the 400x400 preview canvas against the default (centered) position of the unscaled foreground,
|
| 287 |
+
and is then converted into the actual pixel displacement at the original foreground resolution.
|
| 288 |
+
4. The repositioned foreground is pasted onto a blank, foreground-sized canvas (uncovered areas are left blank/white).
|
| 289 |
+
|
| 290 |
+
Args:
|
| 291 |
+
original_fg: the originally uploaded foreground image (a PIL Image)
|
| 292 |
+
transformation_info: a dict that must contain the following fields (illustrative values are sketched after this method):
|
| 293 |
+
- "drag_left": 拖拽后当前显示的前景图左上角横坐标(受缩放影响,单位像素)
|
| 294 |
+
- "drag_top": 拖拽后当前显示的前景图左上角纵坐标(受缩放影响,单位像素)
|
| 295 |
+
- "scale_ratio": 预览时前景图缩放比例
|
| 296 |
+
- "data_original_width": 前景图在预览中未缩放时的宽度
|
| 297 |
+
- "data_original_height": 前景图在预览中未缩放时的高度
|
| 298 |
+
canvas_size: preview canvas size (default 400, matching the frontend)
|
| 299 |
+
|
| 300 |
+
Returns:
|
| 301 |
+
A tuple (final_output, fit_fg) of RGBA PIL Images sized like original_bg: final_output places the foreground
|
| 302 |
+
at its dragged position and scale, and fit_fg is the foreground simply resized to fit inside the background.
|
| 303 |
+
"""
|
| 304 |
+
# Read the parameters from transformation_info
|
| 305 |
+
drag_left = float(transformation_info.get("drag_left", 0))
|
| 306 |
+
drag_top = float(transformation_info.get("drag_top", 0))
|
| 307 |
+
scale_ratio = float(transformation_info.get("scale_ratio", 1))
|
| 308 |
+
data_orig_width = float(transformation_info.get("data_original_width", canvas_size))
|
| 309 |
+
data_orig_height = float(transformation_info.get("data_original_height", canvas_size))
|
| 310 |
+
drag_width = float(transformation_info.get("drag_width", 0))
|
| 311 |
+
drag_height = float(transformation_info.get("drag_height", 0))
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
scale_ori_fg = canvas_size / max(original_fg.width, original_fg.height)
|
| 315 |
+
scale_ori_bg = canvas_size / max(original_bg.width, original_bg.height)
|
| 316 |
+
|
| 317 |
+
# Default centered position of the unscaled foreground in the preview (i.e. before any dragging)
|
| 318 |
+
default_left = (canvas_size - data_orig_width) / 2.0
|
| 319 |
+
default_top = (canvas_size - data_orig_height) / 2.0
|
| 320 |
+
|
| 321 |
+
# Actual drag offset, in preview pixels, measured in the unscaled state
|
| 322 |
+
offset_preview_x = drag_left - default_left
|
| 323 |
+
offset_preview_y = drag_top - default_top
|
| 324 |
+
|
| 325 |
+
offset_ori_x = offset_preview_x / scale_ori_fg
|
| 326 |
+
offset_ori_y = offset_preview_y / scale_ori_fg
|
| 327 |
+
|
| 328 |
+
new_width = int(original_fg.width * scale_ratio)
|
| 329 |
+
new_height = int(original_fg.height * scale_ratio)
|
| 330 |
+
scale_fg = original_fg.resize((new_width, new_height))
|
| 331 |
+
|
| 332 |
+
output = Image.new("RGBA", (original_fg.width, original_fg.height), (255, 255, 255, 0))
|
| 333 |
+
output.paste(scale_fg, (int(offset_ori_x), int(offset_ori_y)))
|
| 334 |
+
|
| 335 |
+
new_width_fgbg = original_fg.width * scale_ori_fg / scale_ori_bg
|
| 336 |
+
new_height_fgbg = original_fg.height * scale_ori_fg / scale_ori_bg
|
| 337 |
+
scale_fgbg = output.resize((int(new_width_fgbg), int(new_height_fgbg)))
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
final_output = Image.new("RGBA", (original_bg.width, original_bg.height), (255, 255, 255, 0))
|
| 341 |
+
scale_fgbg = self.pad_or_crop(scale_fgbg, (original_bg.width, original_bg.height), (255, 255, 255, 0))
|
| 342 |
+
final_output.paste(scale_fgbg, (0, 0))
|
| 343 |
+
|
| 344 |
+
fit_fg = self.resize_to_fit_within(original_bg, original_fg)
|
| 345 |
+
fit_fg = self.pad_or_crop(fit_fg, original_bg.size, (255, 255, 255, 0))
|
| 346 |
+
|
| 347 |
+
return final_output, fit_fg
|
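# Illustrative transformation_info payload for the method above (hypothetical values;
# the real dict is produced by the Gradio front end):
#   {"drag_left": 120, "drag_top": 80, "drag_width": 200, "drag_height": 150,
#    "scale_ratio": 0.5, "data_original_width": 400, "data_original_height": 300}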
| 348 |
+
|
| 349 |
+
@torch.inference_mode()
|
| 350 |
+
def gradio_generate(self, background_img, foreground_img, transformation_info, seed, prompt, enable_gui, cfg=3.5, size_select="1024", text_strength=1, truecfg=False):
|
| 351 |
+
print("!"*10)
|
| 352 |
+
"""使用 DreamFuseInference 进行模型推理"""
|
| 353 |
+
try:
|
| 354 |
+
trans = json.loads(transformation_info)
|
| 355 |
+
except Exception:
|
| 356 |
+
trans = {}
|
| 357 |
+
|
| 358 |
+
size_select = int(size_select)
|
| 359 |
+
|
| 360 |
+
# import pdb; pdb.set_trace()
|
| 361 |
+
r, g, b, ori_a = foreground_img.split()
|
| 362 |
+
fg_img_scale, fg_img = self.transform_foreground_original(foreground_img, background_img, trans)
|
| 363 |
+
|
| 364 |
+
new_r, new_g, new_b, new_a = fg_img_scale.split()
|
| 365 |
+
foreground_img_scale = Image.merge("RGB", (new_r, new_g, new_b))
|
| 366 |
+
|
| 367 |
+
r, g, b, ori_a = fg_img.split()
|
| 368 |
+
foreground_img = Image.merge("RGB", (r, g, b))
|
| 369 |
+
foreground_img_save = foreground_img.copy()
|
| 370 |
+
ori_a = ori_a.convert("L")
|
| 371 |
+
new_a = new_a.convert("L")
|
| 372 |
+
foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_a))
|
| 373 |
+
print("0"*10)
|
| 374 |
+
print(foreground_img.size)
|
| 375 |
+
print(background_img.size)
|
| 376 |
+
images = self.model_generate(foreground_img.copy(), background_img.copy(),
|
| 377 |
+
ori_a, new_a,
|
| 378 |
+
enable_mask_affine=enable_gui,
|
| 379 |
+
prompt=prompt,
|
| 380 |
+
offset_cond=[0, 1, 0] if not enable_gui else None,
|
| 381 |
+
seed=seed,
|
| 382 |
+
cfg=cfg,
|
| 383 |
+
size_select=size_select,
|
| 384 |
+
text_strength=text_strength,
|
| 385 |
+
truecfg=truecfg)
|
| 386 |
+
images = Image.fromarray(images[0], "RGB")
|
| 387 |
+
|
| 388 |
+
images = images.resize(background_img.size)
|
| 389 |
+
images_save = images.copy()
|
| 390 |
+
|
| 391 |
+
images.thumbnail((640, 640), Image.LANCZOS)
|
| 392 |
+
return images
|
| 393 |
+
|
| 394 |
+
|
| 395 |
+
@torch.inference_mode()
|
| 396 |
+
def model_generate(self, fg_image, bg_image, ori_fg_mask, new_fg_mask, enable_mask_affine=True, prompt="", offset_cond=None, seed=None, cfg=3.5, size_select=1024, text_strength=1, truecfg=False):
|
| 397 |
+
batch_size = 1
|
| 398 |
+
print("-3"*10)
|
| 399 |
+
# Prepare images
|
| 400 |
+
# adjust bg->fg size
|
| 401 |
+
fg_image, ori_fg_mask = adjust_fg_to_bg(fg_image, ori_fg_mask, bg_image.size)
|
| 402 |
+
bucket_size = find_nearest_bucket_size(bg_image.size[0], bg_image.size[1], bucket_size=size_select)
|
| 403 |
+
|
| 404 |
+
fg_image = fg_image.resize(bucket_size)
|
| 405 |
+
bg_image = bg_image.resize(bucket_size)
|
| 406 |
+
|
| 407 |
+
mask_affine = None
|
| 408 |
+
if enable_mask_affine:
|
| 409 |
+
ori_fg_mask = ori_fg_mask.resize(bucket_size)
|
| 410 |
+
new_fg_mask = new_fg_mask.resize(bucket_size)
|
| 411 |
+
mask_affine = get_mask_affine(new_fg_mask, ori_fg_mask)
|
| 412 |
+
|
| 413 |
+
print("-2"*10)
|
| 414 |
+
# Get embeddings
|
| 415 |
+
prompt_embeds, pooled_prompt_embeds, text_ids = self._compute_text_embeddings(prompt)
|
| 416 |
+
|
| 417 |
+
prompt_embeds = prompt_embeds.repeat(1, text_strength, 1)
|
| 418 |
+
text_ids = text_ids.repeat(text_strength, 1)
|
| 419 |
+
|
| 420 |
+
# Prepare
|
| 421 |
+
if self.config.model_choice == "dev":
|
| 422 |
+
guidance = torch.full([1], cfg, device=self.device, dtype=torch.float32)
|
| 423 |
+
guidance = guidance.expand(batch_size)
|
| 424 |
+
else:
|
| 425 |
+
guidance = None
|
| 426 |
+
|
| 427 |
+
# Prepare generator
|
| 428 |
+
if seed is None:
|
| 429 |
+
seed = self.config.seed
|
| 430 |
+
generator = torch.Generator(device=self.device).manual_seed(seed)
|
| 431 |
+
print("-1"*10)
|
| 432 |
+
# Prepare condition latents
|
| 433 |
+
condition_image_latents = self._encode_images([fg_image, bg_image])
|
| 434 |
+
|
| 435 |
+
if offset_cond is None:
|
| 436 |
+
offset_cond = self.config.image_ids_offset
|
| 437 |
+
offset_cond = offset_cond[1:]
|
| 438 |
+
cond_latent_image_ids = []
|
| 439 |
+
for offset_ in offset_cond:
|
| 440 |
+
cond_latent_image_ids.append(
|
| 441 |
+
self._prepare_image_ids(
|
| 442 |
+
condition_image_latents.shape[2] // 2,
|
| 443 |
+
condition_image_latents.shape[3] // 2,
|
| 444 |
+
offset_w=offset_ * condition_image_latents.shape[3] // 2
|
| 445 |
+
)
|
| 446 |
+
)
|
| 447 |
+
|
| 448 |
+
print(1)
|
| 449 |
+
if mask_affine is not None:
|
| 450 |
+
affine_H, affine_W = condition_image_latents.shape[2] // 2, condition_image_latents.shape[3] // 2
|
| 451 |
+
scale_factor = 1 / 16
|
| 452 |
+
cond_latent_image_ids_fg = cond_latent_image_ids[0].reshape(affine_H, affine_W, 3).clone()
|
| 453 |
+
|
| 454 |
+
# opt 1
|
| 455 |
+
cond_latent_image_ids[0] = warp_affine_tensor(
|
| 456 |
+
cond_latent_image_ids_fg, mask_affine, output_size=(affine_H, affine_W),
|
| 457 |
+
scale_factor=scale_factor, device=self.device,
|
| 458 |
+
)
|
| 459 |
+
cond_latent_image_ids = torch.stack(cond_latent_image_ids)
|
| 460 |
+
print(2)
|
| 461 |
+
# Pack condition latents
|
| 462 |
+
cond_image_latents = self._pack_latents(condition_image_latents)
|
| 463 |
+
cond_input = {
|
| 464 |
+
"image_latents": cond_image_latents,
|
| 465 |
+
"image_ids": cond_latent_image_ids,
|
| 466 |
+
}
|
| 467 |
+
# Prepare initial latents
|
| 468 |
+
width, height = bucket_size
|
| 469 |
+
num_channels_latents = self.denoise_model.config.in_channels // 4
|
| 470 |
+
latents, latent_image_ids = self._prepare_latents(
|
| 471 |
+
batch_size, num_channels_latents, height, width, generator
|
| 472 |
+
)
|
| 473 |
+
print(3)
|
| 474 |
+
# Setup timesteps
|
| 475 |
+
sigmas = np.linspace(1.0, 1 / self.config.num_inference_steps, self.config.num_inference_steps)
|
| 476 |
+
image_seq_len = latents.shape[1]
|
| 477 |
+
mu = calculate_shift(
|
| 478 |
+
image_seq_len,
|
| 479 |
+
self.scheduler.config.base_image_seq_len,
|
| 480 |
+
self.scheduler.config.max_image_seq_len,
|
| 481 |
+
self.scheduler.config.base_shift,
|
| 482 |
+
self.scheduler.config.max_shift,
|
| 483 |
+
)
|
| 484 |
+
timesteps, num_inference_steps = retrieve_timesteps(
|
| 485 |
+
self.scheduler,
|
| 486 |
+
self.config.num_inference_steps,
|
| 487 |
+
self.device,
|
| 488 |
+
sigmas=sigmas,
|
| 489 |
+
mu=mu,
|
| 490 |
+
)
|
| 491 |
+
print(4)
|
| 492 |
+
# Denoising loop
|
| 493 |
+
for i, t in enumerate(timesteps):
|
| 494 |
+
timestep = t.expand(latents.shape[0]).to(latents.dtype)
|
| 495 |
+
with torch.autocast(enabled=True, device_type="cuda", dtype=self.config.dtype):
|
| 496 |
+
noise_pred = self.denoise_model(
|
| 497 |
+
hidden_states=latents,
|
| 498 |
+
cond_input=cond_input,
|
| 499 |
+
timestep=timestep / 1000,
|
| 500 |
+
guidance=guidance,
|
| 501 |
+
pooled_projections=pooled_prompt_embeds,
|
| 502 |
+
encoder_hidden_states=prompt_embeds,
|
| 503 |
+
txt_ids=text_ids,
|
| 504 |
+
img_ids=latent_image_ids,
|
| 505 |
+
data_num_per_group=batch_size,
|
| 506 |
+
image_tags=self.config.image_tags,
|
| 507 |
+
context_tags=self.config.context_tags,
|
| 508 |
+
max_sequence_length=self.config.max_sequence_length,
|
| 509 |
+
mix_attention_double=self.config.mix_attention_double,
|
| 510 |
+
mix_attention_single=self.config.mix_attention_single,
|
| 511 |
+
joint_attention_kwargs=None,
|
| 512 |
+
return_dict=False,
|
| 513 |
+
)[0]
|
| 514 |
+
|
| 515 |
+
if truecfg and i >= 1:
|
| 516 |
+
guidance_neg = torch.full([1], 1, device=self.device, dtype=torch.float32)
|
| 517 |
+
guidance_neg = guidance_neg.expand(batch_size)
|
| 518 |
+
noise_pred_neg = self.denoise_model(
|
| 519 |
+
hidden_states=latents,
|
| 520 |
+
cond_input=cond_input,
|
| 521 |
+
timestep=timestep / 1000,
|
| 522 |
+
guidance=guidance_neg,
|
| 523 |
+
pooled_projections=pooled_prompt_embeds,
|
| 524 |
+
encoder_hidden_states=prompt_embeds,
|
| 525 |
+
txt_ids=text_ids,
|
| 526 |
+
img_ids=latent_image_ids,
|
| 527 |
+
data_num_per_group=batch_size,
|
| 528 |
+
image_tags=self.config.image_tags,
|
| 529 |
+
context_tags=self.config.context_tags,
|
| 530 |
+
max_sequence_length=self.config.max_sequence_length,
|
| 531 |
+
mix_attention_double=self.config.mix_attention_double,
|
| 532 |
+
mix_attention_single=self.config.mix_attention_single,
|
| 533 |
+
joint_attention_kwargs=None,
|
| 534 |
+
return_dict=False,
|
| 535 |
+
)[0]
|
| 536 |
+
noise_pred = noise_pred_neg + 5 * (noise_pred - noise_pred_neg)
|
| 537 |
+
|
| 538 |
+
# Compute previous noisy sample
|
| 539 |
+
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
| 540 |
+
print(5)
|
| 541 |
+
# Decode latents
|
| 542 |
+
latents = self._unpack_latents(latents, height, width)
|
| 543 |
+
latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
|
| 544 |
+
images = self.vae.decode(latents, return_dict=False)[0]
|
| 545 |
+
print(6)
|
| 546 |
+
# Post-process images
|
| 547 |
+
images = images.add(1).mul(127.5).clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1).cpu().numpy()
|
| 548 |
+
return images
|
| 549 |
+
|
| 550 |
+
def _encode_images(self, images):
|
| 551 |
+
return encode_images_cond(self.vae, [images], self.device)
|
| 552 |
+
|
| 553 |
+
def _prepare_image_ids(self, h, w, offset_w=0):
|
| 554 |
+
return _prepare_image_ids(h, w, offset_w=offset_w).to(self.device)
|
| 555 |
+
|
| 556 |
+
def _pack_latents(self, latents):
|
| 557 |
+
b, c, h, w = latents.shape
|
| 558 |
+
return _pack_latents(latents, b, c, h, w)
|
| 559 |
+
|
| 560 |
+
def _unpack_latents(self, latents, height, width):
|
| 561 |
+
vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
| 562 |
+
return _unpack_latents(latents, height, width, vae_scale)
|
| 563 |
+
|
| 564 |
+
def _prepare_latents(self, batch_size, num_channels_latents, height, width, generator):
|
| 565 |
+
vae_scale = 2 ** (len(self.vae.config.block_out_channels) - 1)
|
| 566 |
+
latents, latent_image_ids = prepare_latents(
|
| 567 |
+
batch_size=batch_size,
|
| 568 |
+
num_channels_latents=num_channels_latents,
|
| 569 |
+
vae_downsample_factor=vae_scale,
|
| 570 |
+
height=height,
|
| 571 |
+
width=width,
|
| 572 |
+
dtype=self.config.dtype,
|
| 573 |
+
device=self.device,
|
| 574 |
+
generator=generator,
|
| 575 |
+
offset=None
|
| 576 |
+
)
|
| 577 |
+
return latents, latent_image_ids
|
| 578 |
+
|
| 579 |
+
def main():
|
| 580 |
+
parser = transformers.HfArgumentParser(InferenceConfig)
|
| 581 |
+
config: InferenceConfig = parser.parse_args_into_dataclasses()[0]
|
| 582 |
+
model = DreamFuseInference(config)
|
| 583 |
+
os.makedirs(config.valid_output_dir, exist_ok=True)
|
| 584 |
+
for valid_root, valid_json in zip(config.valid_roots, config.valid_jsons):
|
| 585 |
+
with open(valid_json, 'r') as f:
|
| 586 |
+
valid_info = json.load(f)
|
| 587 |
+
|
| 588 |
+
# multi gpu
|
| 589 |
+
to_process = sorted(list(valid_info.keys()))
|
| 590 |
+
|
| 591 |
+
# debug
|
| 592 |
+
to_process = [k for k in to_process if "data_wear" in k and "pixelwave" in k]
|
| 593 |
+
# debug
|
| 594 |
+
|
| 595 |
+
sd_idx = len(to_process) // config.total_num * config.sub_idx
|
| 596 |
+
ed_idx = len(to_process) // config.total_num * (config.sub_idx + 1)
|
| 597 |
+
if config.sub_idx < config.total_num - 1:
|
| 598 |
+
print(config.sub_idx, sd_idx, ed_idx)
|
| 599 |
+
to_process = to_process[sd_idx:ed_idx]
|
| 600 |
+
else:
|
| 601 |
+
print(config.sub_idx, sd_idx)
|
| 602 |
+
to_process = to_process[sd_idx:]
|
| 603 |
+
valid_info = {k: valid_info[k] for k in to_process}
|
| 604 |
+
|
| 605 |
+
for meta_key, info in tqdm(valid_info.items()):
|
| 606 |
+
img_name = meta_key.split('/')[-1]
|
| 607 |
+
|
| 608 |
+
foreground_img = Image.open(os.path.join(valid_root, info['img_info']['000']))
|
| 609 |
+
background_img = Image.open(os.path.join(valid_root, info['img_info']['001']))
|
| 610 |
+
|
| 611 |
+
new_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000_mask_scale']))
|
| 612 |
+
ori_fg_mask = Image.open(os.path.join(valid_root, info['img_mask_info']['000']))
|
| 613 |
+
|
| 614 |
+
# debug
|
| 615 |
+
foreground_img.save(os.path.join(config.valid_output_dir, f"{img_name}_0.png"))
|
| 616 |
+
background_img.save(os.path.join(config.valid_output_dir, f"{img_name}_1.png"))
|
| 617 |
+
ori_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask.png"))
|
| 618 |
+
new_fg_mask.save(os.path.join(config.valid_output_dir, f"{img_name}_0_mask_scale.png"))
|
| 619 |
+
# debug
|
| 620 |
+
|
| 621 |
+
foreground_img.paste((255, 255, 255), mask=ImageOps.invert(ori_fg_mask))
|
| 622 |
+
|
| 623 |
+
images = model.model_generate(foreground_img.copy(), background_img.copy(),
|
| 624 |
+
ori_fg_mask, new_fg_mask,
|
| 625 |
+
prompt=config.ref_prompts,
|
| 626 |
+
seed=config.seed,
|
| 627 |
+
cfg=config.guidance_scale,
|
| 628 |
+
size_select=config.inference_scale,
|
| 629 |
+
text_strength=config.text_strength,
|
| 630 |
+
truecfg=config.truecfg)
|
| 631 |
+
|
| 632 |
+
result_image = Image.fromarray(images[0], "RGB")
|
| 633 |
+
result_image = result_image.resize(background_img.size)
|
| 634 |
+
result_image.save(os.path.join(config.valid_output_dir, f"{img_name}_2.png"))
|
| 635 |
+
# Make grid
|
| 636 |
+
grid_image = [foreground_img, background_img] + [result_image]
|
| 637 |
+
result = make_image_grid(grid_image, 1, len(grid_image), size=result_image.size)
|
| 638 |
+
|
| 639 |
+
result.save(os.path.join(config.valid_output_dir, f"{img_name}.jpg"))
|
| 640 |
+
|
| 641 |
+
if __name__ == "__main__":
|
| 642 |
+
main()
|
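For reference, a minimal sketch of driving the script programmatically instead of through the Gradio app or the CLI `main()` above. It assumes the file is importable as `dreamfuse_inference`, that a DreamFuse LoRA checkpoint path is available for `lora_id`, and that `examples/9_01.png` is an RGBA foreground while `examples/9_02.png` is a background; the paths and output filename are placeholders, not guarantees of this commit.

from PIL import Image, ImageOps
from dreamfuse_inference import InferenceConfig, DreamFuseInference

config = InferenceConfig(lora_id="/path/to/dreamfuse_lora")  # placeholder LoRA path
model = DreamFuseInference(config)

fg = Image.open("examples/9_01.png").convert("RGBA")
bg = Image.open("examples/9_02.png").convert("RGB")

# White out the foreground's transparent pixels and reuse its alpha as both masks,
# mirroring what main() does when no GUI drag information is available.
r, g, b, a = fg.split()
fg_rgb = Image.merge("RGB", (r, g, b))
fg_rgb.paste((255, 255, 255), mask=ImageOps.invert(a))

images = model.model_generate(
    fg_rgb, bg, a, a,
    prompt="", seed=config.seed, cfg=config.guidance_scale,
    size_select=config.inference_scale, text_strength=config.text_strength,
    truecfg=config.truecfg,
)
Image.fromarray(images[0], "RGB").resize(bg.size).save("fused_result.png")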
examples/9_01.png
ADDED
|
Git LFS Details
|
examples/9_02.png
ADDED
|
Git LFS Details
|
output_images/no_bg_image.png
ADDED
|
Git LFS Details
|
requirements.txt
ADDED
|
@@ -0,0 +1,37 @@
| 1 |
+
lmdb==1.4.1
|
| 2 |
+
tqdm==4.66.1
|
| 3 |
+
datasets
|
| 4 |
+
tensorboardX
|
| 5 |
+
accelerate
|
| 6 |
+
ninja
|
| 7 |
+
transformers==4.46.3
|
| 8 |
+
pycocotools==2.0.7
|
| 9 |
+
scikit-image
|
| 10 |
+
Pillow==9.5.0
|
| 11 |
+
opencv-python
|
| 12 |
+
opencv-python-headless
|
| 13 |
+
datasets
|
| 14 |
+
einops==0.8.0
|
| 15 |
+
sentencepiece
|
| 16 |
+
pydantic==2.9.2
|
| 17 |
+
deepspeed
|
| 18 |
+
peft==0.14.0
|
| 19 |
+
diffusers==0.32.0
|
| 20 |
+
rotary-embedding-torch==0.8.4
|
| 21 |
+
tiktoken==0.8.0
|
| 22 |
+
transformers_stream_generator==0.0.5
|
| 23 |
+
ftfy
|
| 24 |
+
bs4
|
| 25 |
+
bson==0.5.10
|
| 26 |
+
gradio==5.12.0
|
| 27 |
+
httpx
|
| 28 |
+
fairscale==0.4.13
|
| 29 |
+
kornia
|
| 30 |
+
timm==1.0.9
|
| 31 |
+
protobuf==3.20.0
|
| 32 |
+
basicsr
|
| 33 |
+
sentencepiece
|
| 34 |
+
huggingface_hub
|
| 35 |
+
prodigyopt
|
| 36 |
+
torch==2.4.0
|
| 37 |
+
torchvision==0.19.0
|