Spaces:

tori29umai
/

Stick2Body

Running on Zero

App Files Files Community

tori29umai committed on Aug 2, 2024

Commit

dce20cd

1 Parent(s): 7f60683

Add application file

Browse files

Files changed (7) hide show

app.py +98 -0
config.json +57 -0
requirements.txt +24 -0
utils/dl_utils.py +72 -0
utils/image_utils.py +64 -0
utils/prompt_utils.py +28 -0
utils/tagger.py +137 -0

app.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import spaces
+import gradio as gr
+import torch
+from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, ControlNetModel, AutoencoderKL
+from PIL import Image
+import os
+import time
+from utils.dl_utils import dl_cn_model, dl_cn_config, dl_lora_model
+from utils.image_utils import resize_image_aspect_ratio, base_generation
+from utils.prompt_utils import remove_duplicates
# Weight directories live directly under the current working directory.
path = os.getcwd()
cn_dir = f"{path}/controlnet"
lora_dir = f"{path}/lora"
for _weights_dir in (cn_dir, lora_dir):
    os.makedirs(_weights_dir, exist_ok=True)

# Populate the directories at import time: ControlNet weights and their
# config go into cn_dir, the LoRA file into lora_dir.
dl_cn_model(cn_dir)
dl_cn_config(cn_dir)
dl_lora_model(lora_dir)
def load_model(lora_dir, cn_dir):
    """Assemble the SDXL ControlNet img2img pipeline used by ``predict``.

    Args:
        lora_dir: Directory containing ``Fixhands_anime_bdsqlsz_V1.safetensors``.
        cn_dir: Directory containing the ControlNet weights and ``config.json``.

    Returns:
        The pipeline, with model CPU offload enabled and the LoRA weights loaded.
    """
    half = torch.float16  # every component is loaded in half precision

    vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=half)
    controlnet = ControlNetModel.from_pretrained(cn_dir, torch_dtype=half, use_safetensors=True)

    pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
        "cagliostrolab/animagine-xl-3.1",
        controlnet=controlnet,
        vae=vae,
        torch_dtype=half,
    )
    pipeline.enable_model_cpu_offload()
    pipeline.load_lora_weights(lora_dir, weight_name="Fixhands_anime_bdsqlsz_V1.safetensors")
    return pipeline
@spaces.GPU(duration=120)
def predict(input_image_path, prompt, negative_prompt, controlnet_scale):
    """Generate a body image from a stick-figure control image.

    Args:
        input_image_path: File path of the uploaded control image, as provided
            by the Gradio ``Image`` component (``None`` when nothing was uploaded).
        prompt: User prompt; appended to a fixed quality/style prefix.
        negative_prompt: Negative prompt passed to the pipeline unchanged.
        controlnet_scale: ControlNet conditioning scale (converted to ``float``).

    Returns:
        The generated PIL image, resized back to the input image's size.

    Raises:
        gr.Error: If no input image was supplied.
    """
    # Without this guard Image.open(None) fails with an opaque error that the
    # UI cannot explain to the user.
    if not input_image_path:
        raise gr.Error("Please upload an input image before generating.")

    pipe = load_model(lora_dir, cn_dir)
    input_image = Image.open(input_image_path)
    # The img2img source is a plain white canvas of the same size as the input;
    # the uploaded image is used only as the ControlNet conditioning image.
    base_image = base_generation(input_image.size, (255, 255, 255, 255)).convert("RGB")
    resize_image = resize_image_aspect_ratio(input_image)
    resize_base_image = resize_image_aspect_ratio(base_image)
    # Fixed seed: the same inputs give the same output on every call.
    generator = torch.manual_seed(0)
    last_time = time.time()
    prompt = "masterpiece, best quality, simple background, white background, bald, nude, " + (prompt or "")
    prompt = remove_duplicates(prompt)
    print(prompt)
    output_image = pipe(
        image=resize_base_image,
        control_image=resize_image,
        strength=1.0,
        prompt=prompt,
        negative_prompt=negative_prompt,
        controlnet_conditioning_scale=float(controlnet_scale),
        generator=generator,
        num_inference_steps=30,
        eta=1.0,
    ).images[0]
    print(f"Time taken: {time.time() - last_time}")
    # Undo the bucket resize so the result matches the uploaded image's size.
    output_image = output_image.resize(input_image.size, Image.LANCZOS)
    return output_image
class Img2Img:
    """Gradio front end that wires the input widgets to ``predict``."""

    def __init__(self):
        # Initialise plain attributes BEFORE building the layout: layout()
        # stores the Gradio components on self, and resetting an attribute
        # afterwards would discard the component reference it just stored.
        self.tagger_model = None
        self.input_image_path = None
        self.canny_image = None
        self.demo = self.layout()

    def layout(self):
        """Build the Blocks UI and return it.

        Stores the created components on ``self`` (``input_image_path``,
        ``prompt``, ``negative_prompt``, ``controlnet_scale``, ``output_image``)
        and connects the generate button to ``predict``.
        """
        css = """
        #intro{
            max-width: 32rem;
            text-align: center;
            margin: 0 auto;
        }
        """
        with gr.Blocks(css=css) as demo:
            with gr.Row():
                with gr.Column():
                    self.input_image_path = gr.Image(label="input_image", type='filepath')
                    self.prompt = gr.Textbox(label="prompt", lines=3)
                    # NOTE(review): "lowers" in the default below may be a typo
                    # for the "lowres" tag - confirm before changing, since it
                    # alters generation defaults.
                    self.negative_prompt = gr.Textbox(label="negative_prompt", lines=3, value="nsfw, nipples, bad anatomy, liquid fingers, low quality, worst quality, out of focus, ugly, error, jpeg artifacts, lowers, blurry, bokeh")
                    self.controlnet_scale = gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.01, label="Stick_fidelity")
                    generate_button = gr.Button("generate")
                with gr.Column():
                    self.output_image = gr.Image(type="pil", label="output_image")
            generate_button.click(
                fn=predict,
                inputs=[self.input_image_path, self.prompt, self.negative_prompt, self.controlnet_scale],
                outputs=self.output_image,
            )
        return demo
# Build the UI once at import time, then start the Gradio server.
img2img = Img2Img()
# NOTE(review): share=True asks Gradio for a public share link; presumably
# unnecessary when hosted on Spaces - confirm before removing.
img2img.demo.launch(share=True)

config.json ADDED Viewed

	@@ -0,0 +1,57 @@

+{
+  "_class_name": "ControlNetModel",
+  "_diffusers_version": "0.27.2",
+  "act_fn": "silu",
+  "addition_embed_type": "text_time",
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": 256,
+  "attention_head_dim": [
+    5,
+    10,
+    20
+  ],
+  "block_out_channels": [
+    320,
+    640,
+    1280
+  ],
+  "class_embed_type": null,
+  "conditioning_channels": 3,
+  "conditioning_embedding_out_channels": [
+    16,
+    32,
+    96,
+    256
+  ],
+  "controlnet_conditioning_channel_order": "rgb",
+  "cross_attention_dim": 2048,
+  "down_block_types": [
+    "DownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "global_pool_conditions": false,
+  "in_channels": 4,
+  "layers_per_block": 2,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossAttn",
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "projection_class_embeddings_input_dim": 2816,
+  "resnet_time_scale_shift": "default",
+  "transformer_layers_per_block": [
+    1,
+    2,
+    10
+  ],
+  "upcast_attention": null,
+  "use_linear_projection": true
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,24 @@

+gradio==4.29.0
+accelerate
+transformers
+torchvision
+xformers
+accelerate
+invisible-watermark
+huggingface-hub
+hf-transfer
+compel
+opencv-python
+numpy
+diffusers==0.27.0
+transformers
+accelerate
+safetensors
+hidiffusion==0.1.8
+spaces
+torch==2.2
+controlnet-aux==0.0.9
+onnx==1.16.1
+onnxruntime==1.18.0
+mediapipe==0.10.14
+peft==0.11.1

utils/dl_utils.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import os
+import requests
+from tqdm import tqdm
+import shutil
+from PIL import Image, ImageOps
+import numpy as np
+import cv2
def dl_cn_model(model_dir):
    """Download the ControlNet weights into ``model_dir`` unless already present.

    The remote checkpoint is stored locally as
    ``diffusion_pytorch_model.safetensors``. A non-200 response is reported with
    a printed message and leaves no file behind.

    Args:
        model_dir: Existing directory that receives the weights file.
    """
    file_name = 'diffusion_pytorch_model.safetensors'
    url = "https://huggingface.co/tori29umai/CN_pose3D_V7/resolve/main/CN_pose3D_V7_marged/CN_pose3D_V7_marged.safetensors"
    file_path = os.path.join(model_dir, file_name)
    if os.path.exists(file_path):
        print(f'{file_name} already exists.')
        return

    # Stream the body in chunks so the whole checkpoint is never held in memory.
    with requests.get(url, allow_redirects=True, stream=True, timeout=(10, 120)) as response:
        if response.status_code != 200:
            print(f'Failed to download {file_name}')
            return
        # Write to a temporary name first: an interrupted transfer must not
        # leave a truncated file that the exists() check would later accept.
        partial_path = file_path + '.part'
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    os.replace(partial_path, file_path)
    print(f'Downloaded {file_name}')
def dl_cn_config(model_dir):
    """Copy ``config.json`` from the current working directory into ``model_dir``.

    Does nothing when ``model_dir`` already contains a ``config.json``.
    """
    config_name = 'config.json'
    destination = os.path.join(model_dir, config_name)
    if os.path.exists(destination):
        return
    shutil.copy(os.path.join(os.getcwd(), config_name), destination)
def dl_tagger_model(model_dir):
    """Download the tagger model files into ``model_dir``, skipping existing ones.

    Creates ``model_dir`` if needed. Each file is fetched from the
    ``SmilingWolf/wd-vit-tagger-v3`` repository; a non-200 response is reported
    with a printed message and the remaining files are still attempted.

    Args:
        model_dir: Directory that receives the model files.
    """
    model_id = 'SmilingWolf/wd-vit-tagger-v3'
    files = [
        'config.json', 'model.onnx', 'selected_tags.csv', 'sw_jax_cv_config.json'
    ]
    os.makedirs(model_dir, exist_ok=True)
    for file in files:
        file_path = os.path.join(model_dir, file)
        if os.path.exists(file_path):
            print(f'{file} already exists.')
            continue

        url = f'https://huggingface.co/{model_id}/resolve/main/{file}'
        # Stream the body in chunks so a large file is never held in memory.
        with requests.get(url, allow_redirects=True, stream=True, timeout=(10, 120)) as response:
            if response.status_code != 200:
                print(f'Failed to download {file}')
                continue
            # Write to a temporary name first so an interrupted transfer is
            # not mistaken for a finished download on the next run.
            partial_path = file_path + '.part'
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, file_path)
        print(f'Downloaded {file}')
def dl_lora_model(model_dir):
    """Download the LoRA weights into ``model_dir`` unless already present.

    A non-200 response is reported with a printed message and leaves no file
    behind.

    Args:
        model_dir: Existing directory that receives the LoRA file.
    """
    file_name = 'Fixhands_anime_bdsqlsz_V1.safetensors'
    file_path = os.path.join(model_dir, file_name)
    if os.path.exists(file_path):
        print(f'{file_name} already exists.')
        return

    url = "https://huggingface.co/bdsqlsz/stable-diffusion-xl-anime-V5/resolve/main/Fixhands_anime_bdsqlsz_V1.safetensors"
    # Stream the body in chunks so the whole file is never held in memory.
    with requests.get(url, allow_redirects=True, stream=True, timeout=(10, 120)) as response:
        if response.status_code != 200:
            print(f'Failed to download {file_name}')
            return
        # Write to a temporary name first: an interrupted transfer must not
        # leave a truncated file that the exists() check would later accept.
        partial_path = file_path + '.part'
        with open(partial_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
    os.replace(partial_path, file_path)
    print(f'Downloaded {file_name}')

utils/image_utils.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from PIL import Image, ImageOps
+import numpy as np
+import cv2
def canny_process(image_path, threshold1, threshold2):
    """Run Canny edge detection on an image file and return the edges as a PIL image.

    Transparent areas are flattened onto white before the edges are computed.

    Args:
        image_path: Path of the image to process.
        threshold1: First threshold passed to ``cv2.Canny``.
        threshold2: Second threshold passed to ``cv2.Canny``.
    """
    # Load as RGBA so the alpha channel is available as a paste mask.
    source = Image.open(image_path).convert("RGBA")

    # Paste onto an opaque white canvas, using the image itself as the mask,
    # so that transparent pixels end up white.
    flattened = Image.new('RGBA', source.size, (255, 255, 255, 255))
    flattened.paste(source, (0, 0), source)

    # Drop alpha, convert to grayscale, then detect edges.
    rgb_pixels = np.array(flattened.convert("RGB"))
    grayscale = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    return Image.fromarray(cv2.Canny(grayscale, threshold1, threshold2))
def resize_image_aspect_ratio(image):
    """Resize ``image`` to the preset resolution whose nominal ratio is closest.

    The image's width/height ratio is compared against a fixed table of nominal
    aspect ratios; the image is then resized (not cropped) to the table entry
    with the smallest absolute difference, using Lanczos resampling.

    Args:
        image: PIL image to resize.

    Returns:
        A new, resized PIL image.
    """
    width, height = image.size
    ratio = width / height

    # (nominal aspect ratio, (target width, target height))
    buckets = (
        (1, (1024, 1024)),    # square
        (4/3, (1152, 896)),   # landscape
        (3/2, (1216, 832)),
        (16/9, (1344, 768)),
        (21/9, (1568, 672)),
        (3/1, (1728, 576)),
        (1/4, (512, 2048)),   # portrait
        (1/3, (576, 1728)),
        (9/16, (768, 1344)),
        (2/3, (832, 1216)),
        (3/4, (896, 1152)),
    )

    # min() keeps the earliest entry on ties, so table order matters.
    _, target_size = min(buckets, key=lambda bucket: abs(bucket[0] - ratio))
    return image.resize(target_size, Image.LANCZOS)
def base_generation(size, color):
    """Return a new RGBA image of ``size`` filled with ``color``."""
    return Image.new("RGBA", size, color)

utils/prompt_utils.py ADDED Viewed

	@@ -0,0 +1,28 @@

def remove_duplicates(base_prompt):
    """Remove repeated tags from a comma-separated prompt.

    Tags are compared case-insensitively after trimming surrounding whitespace.
    The first occurrence of each tag is kept (with its original casing) and
    empty entries are dropped.

    Args:
        base_prompt: Comma-separated tag string.

    Returns:
        The unique tags joined with ``", "``.
    """
    seen = set()
    unique_tags = []
    # Split on the bare comma and strip each piece: splitting on the exact
    # string ", " misses duplicates in input such as "a,b" or "a ,b".
    for raw_tag in base_prompt.split(","):
        tag = raw_tag.strip()
        key = tag.lower()
        if key and key not in seen:
            seen.add(key)
            unique_tags.append(tag)
    return ", ".join(unique_tags)
def remove_color(base_prompt):
    """Drop every tag that mentions a colour from a ``", "``-separated prompt.

    A tag is removed when any entry of the colour list occurs anywhere inside
    it as a case-insensitive substring.

    Args:
        base_prompt: Tag string separated by ``", "``.

    Returns:
        The remaining tags joined with ``", "``.
    """
    color_list = ["pink", "red", "orange", "brown", "yellow", "green", "blue", "purple", "blonde", "colored skin", "white hair"]
    kept = []
    for tag in base_prompt.split(", "):
        lowered = tag.lower()
        if any(color in lowered for color in color_list):
            continue  # tag contains a colour word
        kept.append(tag)
    return ", ".join(kept)
def execute_prompt(execute_tags, base_prompt):
    """Return ``base_prompt`` without the tags contained in ``execute_tags``.

    Args:
        execute_tags: Container of tags to drop; membership is tested with ``in``.
        base_prompt: Tag string separated by ``", "``.

    Returns:
        The remaining tags joined with ``", "``.
    """
    remaining = []
    for tag in base_prompt.split(", "):
        if tag in execute_tags:
            continue
        remaining.append(tag)
    return ", ".join(remaining)

utils/tagger.py ADDED Viewed

	@@ -0,0 +1,137 @@

+# -*- coding: utf-8 -*-
+# https://github.com/kohya-ss/sd-scripts/blob/main/finetune/tag_images_by_wd14_tagger.py
+import csv
+import os
+os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
+from PIL import Image
+import cv2
+import numpy as np
+from pathlib import Path
+import onnx
+import onnxruntime as ort
+# from wd14 tagger
+IMAGE_SIZE = 448
+model = None  # Initialize model variable
def convert_array_to_bgr(array):
    """
    Return a 3-channel, reversed-channel-order version of an image array.

    Parameters:
    - array: NumPy array of the image; 2-D (grayscale) or 3-D (channels last).

    Returns:
    - For 2-D input, the array stacked three times along a new last axis.
      For 3-D input, the array with its last axis reversed (RGB -> BGR); a
      fourth channel, if present, is dropped first.

    Raises:
    - ValueError: if the array is neither 2-D nor 3-D.
    """
    dims = array.ndim
    if dims not in (2, 3):
        raise ValueError("Unsupported array shape.")

    if dims == 2:
        # Grayscale: replicate the single plane into three channels.
        return np.stack([array, array, array], axis=-1)

    # Four channels are treated as RGBA: discard alpha before reversing.
    color_planes = array[:, :, :3] if array.shape[2] == 4 else array
    return color_planes[:, :, ::-1]
def preprocess_image(image):
    """Convert an image into a square ``IMAGE_SIZE`` x ``IMAGE_SIZE`` float32 array.

    The image is converted with ``convert_array_to_bgr``, padded to a square
    with the value 255 (original content centred), resized, and cast to
    float32.

    Args:
        image: Image object accepted by ``np.array``.

    Returns:
        A float32 NumPy array of the resized image.
    """
    bgr = convert_array_to_bgr(np.array(image))
    height, width = bgr.shape[0:2]
    side = max(height, width)

    # Split the missing rows/columns as evenly as possible between both sides.
    extra_w = side - width
    extra_h = side - height
    left = extra_w // 2
    top = extra_h // 2
    squared = np.pad(
        bgr,
        ((top, extra_h - top), (left, extra_w - left), (0, 0)),
        mode="constant",
        constant_values=255,
    )

    # Area interpolation when shrinking, Lanczos otherwise.
    if side > IMAGE_SIZE:
        method = cv2.INTER_AREA
    else:
        method = cv2.INTER_LANCZOS4
    resized = cv2.resize(squared, (IMAGE_SIZE, IMAGE_SIZE), interpolation=method)
    return resized.astype(np.float32)
def modelLoad(model_dir):
    """Open ``model.onnx`` from ``model_dir`` in an ONNX Runtime session.

    The session is created with the CPU execution provider only.

    Returns:
        A two-item list ``[session, input_name]``, where ``input_name`` is the
        name of the session's first input.
    """
    session = ort.InferenceSession(
        os.path.join(model_dir, "model.onnx"),
        providers=['CPUExecutionProvider'],  # CPU only
    )
    first_input_name = session.get_inputs()[0].name
    # Report the first provider the session lists.
    print(f"Using provider: {session.get_providers()[0]}")
    return [session, first_input_name]
def analysis(image_path, model_dir, model):
    """Tag an image with the ONNX tagger and return the tags as one string.

    Args:
        image_path: Path of the image to tag.
        model_dir: Directory containing ``selected_tags.csv``.
        model: Two-item sequence ``[ort_session, input_name]`` as returned by
            ``modelLoad``.

    Returns:
        All general and character tags whose probability is at least 0.35,
        joined with ``", "`` in model output order.
    """
    ort_session, input_name = model[0], model[1]

    with open(os.path.join(model_dir, "selected_tags.csv"), "r", encoding="utf-8") as f:
        table = list(csv.reader(f))
    header = table[0]  # tag_id,name,category,count
    rows = table[1:]
    assert header[0] == "tag_id" and header[1] == "name" and header[2] == "category", f"unexpected csv format: {header}"

    # `rows` already excludes the header, so every data row is considered here.
    # Slicing again would drop the first tag and shift every later name by one
    # relative to the probabilities whenever that row is a general/character tag.
    general_tags = [row[1] for row in rows if row[2] == "0"]
    character_tags = [row[1] for row in rows if row[2] == "4"]

    undesired_tags = {"transparent background"}
    remove_underscore = True
    caption_separator = ", "
    general_threshold = 0.35
    character_threshold = 0.35

    # Close the image file as soon as the pixel data has been extracted.
    with Image.open(image_path) as image_pil:
        image_preprocessed = preprocess_image(image_pil)
    image_preprocessed = np.expand_dims(image_preprocessed, axis=0)

    # Run inference; take the first output of the single batch element.
    prob = ort_session.run(None, {input_name: image_preprocessed})[0][0]

    # The first four outputs are skipped - presumably the rating classes, as in
    # the referenced sd-scripts tagger; confirm against selected_tags.csv.
    # The remaining outputs are mapped to general tags first, then character tags.
    num_general = len(general_tags)
    combined_tags = []
    for i, p in enumerate(prob[4:]):
        is_general = i < num_general
        threshold = general_threshold if is_general else character_threshold
        if p >= threshold:
            tag_name = general_tags[i] if is_general else character_tags[i - num_general]
            if remove_underscore and len(tag_name) > 3:  # ignore emoji tags like >_< and ^_^
                tag_name = tag_name.replace("_", " ")
            if tag_name not in undesired_tags:
                combined_tags.append(tag_name)

    return caption_separator.join(combined_tags)