FLUX.2-dev

Running on Zero

File size: 8,778 Bytes

7f4c99b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06529b5
 
 
 
 
 
7f4c99b
06529b5
 
 
7f4c99b
 
 
aa0cb15
 
7f4c99b
 
 
 
 
 
 
 
 
 
 
 
 
 
fc39399
 
 
 
 
 
 
 
 
 
 
7f4c99b
0fb3ed4
 
 
 
02f82c1
0fb3ed4
 
 
75ba08b
7f4c99b
 
 
ce80858
7f4c99b
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
 
 
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
 
 
 
 
 
 
 
 
 
 
c452fe5
 
 
 
7f4c99b
 
 
 
c452fe5
7f4c99b
 
 
 
 
 
 
 
 
 
ce80858
7f4c99b
 
ce80858
7f4c99b
 
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
ce80858
80ddbec
 
 
 
 
 
 
 
 
 
ce80858
 
 
 
 
7f4c99b
ce80858
7f4c99b
 
 
 
 
ce80858
 
 
7f4c99b
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
 
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
 
ce80858
7f4c99b
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
 
ce80858
7f4c99b
ce80858
7f4c99b
 
 
 
 
 
 
 
 
2b7d18b
7f4c99b

import os
import subprocess
import sys
import io
import gradio as gr
import numpy as np
import random
import spaces
import torch
from diffusers import Flux2Pipeline, Flux2Transformer2DModel
from diffusers import BitsAndBytesConfig as DiffBitsAndBytesConfig
from optimization import optimize_pipeline_
import requests
from PIL import Image
import json

# Tensor dtype constant for the models in this file.
dtype = torch.bfloat16

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Upper bounds used by the UI sliders: largest int32 value for the seed,
# and the maximum width/height offered for generated images.
MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

def remote_text_encoder(prompts):
    """Encode ``prompts`` with the remote text-encoder Space.

    A ``gradio_client.Client`` for ``multimodalart/mistral-text-encoder`` is
    created on every call, its ``/encode_text`` endpoint is invoked with
    ``prompt=prompts``, and the first element of the response is passed to
    ``torch.load``.

    Args:
        prompts: Value forwarded unchanged as the ``prompt`` argument of the
            remote endpoint.

    Returns:
        Whatever ``torch.load`` deserialises from the first response element.
        Callers in this file call ``.to("cuda")`` on it, so it is presumably a
        tensor of prompt embeddings -- confirm against the encoder Space.
    """
    # Imported lazily, as in the original, so the dependency is only needed
    # when the encoder is actually called.
    from gradio_client import Client

    encoder_space = Client("multimodalart/mistral-text-encoder")
    response = encoder_space.predict(prompt=prompts, api_name="/encode_text")

    # NOTE(review): torch.load deserialises a file produced by a remote
    # service; depending on the installed torch version's `weights_only`
    # default this may unpickle arbitrary objects -- confirm the Space is
    # trusted or consider passing weights_only=True.
    return torch.load(response[0])

# Load model
repo_id = "black-forest-labs/FLUX.2-dev"

# The transformer is loaded on its own and then handed to the pipeline.
dit = Flux2Transformer2DModel.from_pretrained(repo_id, subfolder="transformer", torch_dtype=dtype)

# text_encoder=None: prompt embeddings are obtained from remote_text_encoder
# rather than from a text encoder inside the pipeline.
pipe = Flux2Pipeline.from_pretrained(repo_id, text_encoder=None, transformer=dit, torch_dtype=dtype)
pipe.to("cuda")

pipe.transformer.set_attention_backend("_flash_3_hub")

# Run the project-local optimisation helper once at import time with a blank
# 1024x1024 input image, a placeholder prompt and a single inference step.
# NOTE(review): presumably this compiles / warms up the pipeline -- confirm
# in optimization.py.
optimize_pipeline_(
    pipe,
    image=[Image.new("RGB", (1024, 1024))],
    prompt_embeds=remote_text_encoder("prompt").to("cuda"),
    guidance_scale=2.5,
    width=1024,
    height=1024,
    num_inference_steps=1,
)


def get_duration(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, progress=gr.Progress(track_tqdm=True)):
    """Estimate the GPU duration to request for one ``infer`` call.

    Passed as ``duration=`` to ``spaces.GPU`` on ``infer``, so the signature
    mirrors ``infer``; only ``input_images`` and ``num_inference_steps`` are
    read here.

    Args:
        input_images: Optional sized collection of input images; ``None``
            counts as zero images.
        num_inference_steps: Number of denoising steps requested.

    Returns:
        ``num_inference_steps * (1 + 0.7 * image_count) + 10``, but never
        less than 65 (presumably seconds -- confirm against the ``spaces``
        documentation).
    """
    if input_images is None:
        image_count = 0
    else:
        image_count = len(input_images)

    # Each input image adds 0.7 to the per-step cost of 1.
    per_step = 1 + 0.7 * image_count
    estimate = num_inference_steps * per_step + 10
    return max(65, estimate)


@spaces.GPU(duration=get_duration)
def infer(prompt, input_images=None, seed=42, randomize_seed=False, width=1024, height=1024, num_inference_steps=50, guidance_scale=2.5, progress=gr.Progress(track_tqdm=True)):
    """Generate one image from ``prompt`` and optional input images.

    Args:
        prompt: Text sent to ``remote_text_encoder``.
        input_images: Optional gallery value; element ``[0]`` of every item
            is used as the image (presumably ``(image, caption)`` pairs from
            ``gr.Gallery`` -- confirm). ``None`` or an empty collection means
            no input images.
        seed: Seed for the torch generator; replaced when ``randomize_seed``
            is true.
        randomize_seed: When true, draw a seed in ``[0, MAX_SEED]``.
        width: Forwarded to the pipeline.
        height: Forwarded to the pipeline.
        num_inference_steps: Forwarded to the pipeline.
        guidance_scale: Forwarded to the pipeline.
        progress: Gradio progress callback used for status messages.

    Returns:
        A ``(image, seed)`` tuple: the first image produced by the pipeline
        and the seed that was actually used.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # Text encoding happens in a separate Space; move the result to the GPU.
    progress(0.1, desc="프롬프트 인코딩 중...")
    embeddings = remote_text_encoder(prompt).to("cuda")

    # The pipeline receives None, not an empty list, when there are no inputs.
    if input_images is None or len(input_images) == 0:
        reference_images = None
    else:
        reference_images = [entry[0] for entry in input_images]

    progress(0.3, desc="이미지 생성 중...")
    rng = torch.Generator(device=device).manual_seed(seed)
    pipeline_kwargs = dict(
        prompt_embeds=embeddings,
        image=reference_images,
        width=width,
        height=height,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        generator=rng,
    )
    output = pipe(**pipeline_kwargs)

    return output.images[0], seed

# Text-only example prompts; each row holds a single value for the prompt box.
examples = [
    [prompt_text]
    for prompt_text in (
        "거실 탁자 위에 꽃병을 만들어 주세요. 꽃병의 색상은 #02eb3c 색상에서 시작하여 #edfa3c로 끝나는 그라데이션입니다. 꽃병 안의 꽃들은 #ff0088 색상입니다",
        "베를린 TV 타워(Fernsehturm)의 전체 구조를 지면 기초부터 안테나 끝까지 보여주는 사진처럼 사실적인 인포그래픽, 콘크리트 축, 금속 구체, 안테나 첨탑을 포함한 전체 구조가 보이는 수직 전체 뷰. 상징적인 구체를 올려다보는 약간의 위쪽 원근감 각도, 깨끗한 흰색 배경에 완벽하게 중앙 배치. 얇은 수평 연결선이 있는 왼쪽 레이블: 매우 큰 굵은 진한 회색 숫자(#2D3748)로 된 '368m' 텍스트가 안테나 끝에 정확히 위치하고 그 아래에 작은 대문자로 'TOTAL HEIGHT'가 있음. 매우 큰 굵은 글씨로 된 '207m' 텍스트와 그 아래 작은 대문자로 'TELECAFÉ'가 있으며, 연결선이 창문 높이의 구체에 정확히 닿아 있음. 구체의 적도에 닿는 수평 연결선이 있는 오른쪽 레이블: 매우 큰 굵은 진한 회색 숫자로 된 '32m' 텍스트와 그 아래 작은 대문자로 'SPHERE DIAMETER'가 있음. 세 개의 균형 잡힌 열로 배열된 하단 섹션: 왼쪽 - 매우 굵은 진한 회색의 큰 텍스트 '986'과 그 아래 대문자로 'STEPS'. 중앙 - 굵은 대문자로 'BERLIN TV TOWER'와 그 아래 가벼운 무게로 'FERNSEHTURM'. 오른쪽 - 굵은 대문자로 'INAUGURATED'와 그 아래 'OCTOBER 3, 1969'. 모든 타이포그래피는 현대적인 산세리프 폰트(Inter 또는 Helvetica 같은), 색상 #2D3748, 깨끗하고 미니멀한 기술 다이어그램 스타일. 수평 연결선은 얇고 정확하며 명확하게 보이고 타워 구조의 정확한 해당 측정 지점에 닿아 있음. 높이와 웅장함을 느낄 수 있는 역동적인 낮은 각도 원근감이 있는 전문적인 건축 입면도 미학, 완벽한 시각적 계층 구조를 가진 포스터급 인포그래픽 디자인.",
        "비오는 정글에서 바나나 잎 아래 피신하고 있는 흠뻑 젖은 카피바라, 클로즈업 사진",
        "통통한 주황색 고양이의 카와이 다이컷 스티커, 크고 반짝이는 눈과 인사하며 발을 올린 행복한 미소와 하트 모양의 분홍 코가 있습니다. 디자인은 검은색 윤곽선과 분홍 볼이 있는 부드러운 그라데이션 음영이 있는 부드러운 둥근 선이 있어야 합니다.",
    )
]

# Examples that pair a prompt with a list of input image files.
examples_images = [
    # Disabled example, kept for reference:
    # ["Replace the top of the person from image 1 with the one from image 2", ["person1.webp", "woman2.webp"]],
    [
        "이미지 1의 사람이 이미지 2의 고양이를 쓰다듬고 있고, 이미지 3의 새가 그들 옆에 있습니다",
        ["woman1.webp", "cat_window.webp", "bird.webp"],
    ],
]

# Stylesheet passed to demo.launch(); targets the column with
# elem_id="col-container".
css = (
    "\n"
    "#col-container {\n"
    "    margin: 0 auto;\n"
    "    max-width: 620px;\n"
    "}\n"
)

# Build the Gradio interface. Components are created in the same order as
# before: header, optional input gallery, prompt, run button, result image,
# advanced settings, then the two example sets.
with gr.Blocks() as demo:
    with gr.Column(elem_id="col-container"):
        # Title and short model description with links.
        gr.Markdown(f"""# FLUX.2 [dev]
FLUX.2 [dev]는 텍스트 지시사항을 기반으로 이미지를 생성, 편집 및 결합할 수 있는 32B 파라미터 rectified flow 모델입니다 [[모델](https://huggingface.co/black-forest-labs/FLUX.2-dev)], [[블로그](https://bfl.ai/blog/flux-2)]
        """)

        # Optional input images (accordion created with open=False).
        with gr.Accordion("입력 이미지 (선택사항)", open=False):
            input_images = gr.Gallery(label="입력 이미지", type="pil", columns=3, rows=1)

        prompt = gr.Text(
            placeholder="프롬프트를 입력하세요",
            label="프롬프트",
            show_label=False,
            container=False,
            lines=10,
            max_lines=15,
        )
        run_button = gr.Button("실행")
        result = gr.Image(label="결과", show_label=False)

        # Generation parameters (accordion created with open=False).
        with gr.Accordion("고급 설정", open=False):
            seed = gr.Slider(label="시드", minimum=0, maximum=MAX_SEED, step=1, value=0)
            randomize_seed = gr.Checkbox(label="랜덤 시드", value=True)

            # Output size, in steps of 32 up to MAX_IMAGE_SIZE.
            with gr.Row():
                width = gr.Slider(label="너비", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)
                height = gr.Slider(label="높이", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=1024)

            # Sampling controls.
            with gr.Row():
                num_inference_steps = gr.Slider(label="추론 단계 수", minimum=1, maximum=100, step=1, value=30)
                guidance_scale = gr.Slider(label="가이던스 스케일", minimum=0.0, maximum=10.0, step=0.1, value=4)

        # Text-only examples: only the prompt is supplied, so infer runs with
        # its own defaults for every other argument.
        gr.Examples(
            examples=examples,
            fn=infer,
            inputs=[prompt],
            outputs=[result, seed],
            cache_examples=True,
            cache_mode="lazy",
        )

        # Examples that also fill the input gallery.
        gr.Examples(
            examples=examples_images,
            fn=infer,
            inputs=[prompt, input_images],
            outputs=[result, seed],
            cache_examples=True,
            cache_mode="lazy",
        )

    # Clicking the button or submitting the prompt both run infer with the
    # full set of controls; the seed slider is updated with the seed used.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            input_images,
            seed,
            randomize_seed,
            width,
            height,
            num_inference_steps,
            guidance_scale,
        ],
        outputs=[result, seed],
    )

demo.launch(css=css)