import functools
import os
import shutil
import sys
import git
import gradio as gr
import numpy as np
import torch
from PIL import Image

print(torch.version.cuda)
os.system('locate libcusolver.so.11')  # debug: check that the CUDA solver library is visible on the system

from gradio_imageslider import ImageSlider
from bilateral_normal_integration.bilateral_normal_integration_cupy import bilateral_normal_integration_function
import spaces
import fire
import argparse
import logging
from tqdm.auto import tqdm
import glob
import json
import cv2
from rembg import remove
from segment_anything import sam_model_registry, SamPredictor
from datetime import datetime
import time
import trimesh

sys.path.append("../")
from models.geowizard_pipeline import DepthNormalEstimationPipeline
from utils.seed_all import seed_all
import matplotlib.pyplot as plt
from utils.de_normalized import align_scale_shift
from utils.depth2normal import *
from diffusers import DiffusionPipeline, DDIMScheduler, AutoencoderKL
from models.unet_2d_condition import UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer
from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
import torchvision.transforms.functional as TF
from torchvision.transforms import InterpolationMode
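
# Select the compute device. A CUDA GPU is effectively required: the SAM predictor
# is loaded onto "cuda" below, and the reconstruction step uses the CuPy-based
# bilateral normal integration.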
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Our current device is", device)

# Initialization: download the checkpoints if they are not present locally.
image_encoder_checkpoint1 = os.path.join(os.path.dirname(__file__), "image_encoder", "model.safetensors")
image_encoder_checkpoint2 = os.path.join(os.path.dirname(__file__), "image_encoder", "pytorch_model.bin")
vae_checkpoint1 = os.path.join(os.path.dirname(__file__), "vae", "diffusion_pytorch_model.bin")
vae_checkpoint2 = os.path.join(os.path.dirname(__file__), "vae", "diffusion_pytorch_model.safetensors")
ckpt_lists = [image_encoder_checkpoint1, image_encoder_checkpoint2, vae_checkpoint1, vae_checkpoint2]

image_encoder_url1 = 'https://huggingface.co/lemonaddie/Geowizard/resolve/main/image_encoder/model.safetensors'
image_encoder_url2 = 'https://huggingface.co/lemonaddie/Geowizard/resolve/main/image_encoder/pytorch_model.bin'
vae_url1 = 'https://huggingface.co/lemonaddie/Geowizard/resolve/main/vae/diffusion_pytorch_model.bin'
vae_url2 = 'https://huggingface.co/lemonaddie/Geowizard/resolve/main/vae/diffusion_pytorch_model.safetensors'
url_lists = [image_encoder_url1, image_encoder_url2, vae_url1, vae_url2]

for ckpt_path, ckpt_url in zip(ckpt_lists, url_lists):
    if not os.path.exists(ckpt_path):
        print("Downloading to " + ckpt_path + "...")
        os.system('wget -P ' + os.path.dirname(ckpt_path) + ' -nv ' + ckpt_url)
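
# Assemble the GeoWizard depth-and-normal estimation pipeline from the locally
# stored component folders (vae/, scheduler/, image_encoder/, feature_extractor/, unet/).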
vae = AutoencoderKL.from_pretrained("./", subfolder='vae')
scheduler = DDIMScheduler.from_pretrained("./", subfolder='scheduler')
image_encoder = CLIPVisionModelWithProjection.from_pretrained("./", subfolder="image_encoder")
feature_extractor = CLIPImageProcessor.from_pretrained("./", subfolder="feature_extractor")
unet = UNet2DConditionModel.from_pretrained('.', subfolder="unet")

pipe = DepthNormalEstimationPipeline(vae=vae,
                                     image_encoder=image_encoder,
                                     feature_extractor=feature_extractor,
                                     unet=unet,
                                     scheduler=scheduler)

outputs_dir = "./outs"

try:
    import xformers
    pipe.enable_xformers_memory_efficient_attention()
except Exception:
    pass  # run without xformers

pipe = pipe.to(device)
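
# Downscale the input image so that its shorter side is at most 512 px.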
def scale_img(img):
    width, height = img.size
    if min(width, height) > 512:
        scale = 512 / min(width, height)
        img = img.resize((int(width * scale), int(height * scale)), Image.LANCZOS)
    return img
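
# Load the Segment Anything (ViT-H) predictor used for foreground masking;
# the checkpoint is downloaded on first run.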
def sam_init():
    # sam_checkpoint = os.path.join(os.path.dirname(__file__), "sam_pt", "sam_vit_l_0b3195.pth")
    # model_type = "vit_l"
    sam_checkpoint = os.path.join(os.path.dirname(__file__), "sam_vit_h_4b8939.pth")
    if not os.path.exists(sam_checkpoint):
        print("Downloading SAM-H checkpoint to " + sam_checkpoint)
        os.system('wget -P ' + os.path.dirname(__file__) + ' -nv https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth')
    model_type = "vit_h"
    sam = sam_model_registry[model_type](checkpoint=sam_checkpoint).to(device="cuda")
    predictor = SamPredictor(sam)
    return predictor

sam_predictor = sam_init()
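
# Run SAM with a bounding-box prompt and return the input as an RGBA image whose
# alpha channel holds the predicted foreground mask, together with the raw masks.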
def sam_segment(predictor, input_image, *bbox_coords):
    bbox = np.array(bbox_coords)
    image = np.asarray(input_image)

    start_time = time.time()
    predictor.set_image(image)
    masks_bbox, scores_bbox, logits_bbox = predictor.predict(
        box=bbox,
        multimask_output=True
    )
    print(f"SAM Time: {time.time() - start_time:.3f}s")

    out_image = np.zeros((image.shape[0], image.shape[1], 4), dtype=np.uint8)
    out_image[:, :, :3] = image
    out_image_bbox = out_image.copy()
    out_image_bbox[:, :, 3] = masks_bbox[-1].astype(np.uint8) * 255
    torch.cuda.empty_cache()
    return Image.fromarray(out_image_bbox, mode='RGBA'), masks_bbox
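
# Gradio callback: run the diffusion pipeline on the input image and return the
# colorized depth/normal previews plus the saved .npy files for download.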
def depth_normal(img_path,
                 denoising_steps,
                 ensemble_size,
                 processing_res,
                 seed,
                 domain):

    seed = int(seed)
    if seed >= 0:
        torch.manual_seed(seed)

    img = Image.open(img_path)
    img = scale_img(img)

    pipe_out = pipe(
        img,
        denoising_steps=denoising_steps,
        ensemble_size=ensemble_size,
        processing_res=processing_res,
        batch_size=0,  # 0: let the pipeline choose the batch size
        domain=domain,
        show_progress_bar=True,
    )

    depth_colored = pipe_out.depth_colored
    normal_colored = pipe_out.normal_colored
    depth_np = pipe_out.depth_np
    normal_np = pipe_out.normal_np

    # Save results under ./outs/<image-name><timestamp>/.
    path_output_dir = os.path.splitext(os.path.basename(img_path))[0] + datetime.now().strftime('%Y%m%d-%H%M%S')
    path_output_dir = os.path.join(outputs_dir, path_output_dir)
    os.makedirs(path_output_dir, exist_ok=True)

    name_base = os.path.splitext(os.path.basename(img_path))[0]
    depth_path = os.path.join(path_output_dir, f"{name_base}_depth.npy")
    normal_path = os.path.join(path_output_dir, f"{name_base}_normal.npy")

    np.save(normal_path, normal_np)
    np.save(depth_path, depth_np)

    return depth_colored, normal_colored, [depth_path, normal_path]
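
# Foreground segmentation: rembg produces an alpha matte, its bounding box is
# used as the SAM prompt, and SAM returns the final mask.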
def seg_foreground(image_file):
    img = Image.open(image_file)
    img = scale_img(img)
    image_rem = img.convert('RGBA')
    print("after resize", image_rem.size)
    image_nobg = remove(image_rem, alpha_matting=True)

    arr = np.asarray(image_nobg)[:, :, -1]
    x_nonzero = np.nonzero(arr.sum(axis=0))
    y_nonzero = np.nonzero(arr.sum(axis=1))
    x_min = int(x_nonzero[0].min())
    y_min = int(y_nonzero[0].min())
    x_max = int(x_nonzero[0].max())
    y_max = int(y_nonzero[0].max())

    masked_image, mask = sam_segment(sam_predictor, img.convert('RGB'), x_min, y_min, x_max, y_max)
    mask = Image.fromarray(np.array(mask[-1]).astype(np.uint8) * 255)
    return masked_image, mask
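
# Gradio callback: segment the foreground, integrate the predicted normal map into
# a surface with bilateral normal integration (BiNI), and export .ply/.obj meshes.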
def reconstruction(image_file, files):
    torch.cuda.empty_cache()

    masked_image, mask = seg_foreground(image_file)
    mask = np.array(mask) > 0.5

    depth_np = np.load(files[0])
    normal_np = np.load(files[1])
    h, w, _ = np.shape(normal_np)

    dir_name = os.path.dirname(os.path.realpath(files[0]))
    mask_output_temp = mask
    name_base = os.path.splitext(os.path.basename(files[0]))[0][:-6]  # strip the "_depth" suffix

    normal_np[:, :, 0] *= -1
    _, surface, _, _, _ = bilateral_normal_integration_function(normal_np, mask_output_temp, k=2, K=None, max_iter=100, tol=1e-4, cg_max_iter=5000, cg_tol=1e-3)

    ply_path = os.path.join(outputs_dir, dir_name, f"{name_base}_recon.ply")
    surface.save(ply_path, binary=False)

    obj_path = ply_path.replace('.ply', '.obj')
    mesh = trimesh.load(ply_path)
    T2 = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]])  # swap the x and y axes for viewing
    mesh.apply_transform(T2)
    mesh.export(obj_path)

    torch.cuda.empty_cache()
    return obj_path, [ply_path], masked_image
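
# Gradio UI: build the Blocks interface and wire the two callbacks above.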
# @spaces.GPU
def run_demo():

    custom_theme = gr.themes.Soft(primary_hue="blue").set(
        button_secondary_background_fill="*neutral_100",
        button_secondary_background_fill_hover="*neutral_200")
    custom_css = '''#disp_image {
        text-align: center; /* Horizontally center the content */
    }'''

    _TITLE = '''GeoWizard: Unleashing the Diffusion Priors for 3D Geometry Estimation from a Single Image'''
    _DESCRIPTION = '''
    <div>
    Generate consistent depth and normal maps from a single image, with high quality and rich details. (Note: we find that the demo running on ZeroGPU produces slightly inferior results compared to an A100 or a 3060 with everything else identical.)
    <a style="display:inline-block; margin-left: .5em" href='https://github.com/fuxiao0719/GeoWizard/'><img src='https://img.shields.io/github/stars/fuxiao0719/GeoWizard?style=social' /></a>
    </div>
    '''
    _GPU_ID = 0

    with gr.Blocks(title=_TITLE, theme=custom_theme, css=custom_css) as demo:
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown('# ' + _TITLE)
                gr.Markdown(_DESCRIPTION)

        with gr.Row(variant='panel'):
            with gr.Column(scale=1):
                input_image = gr.Image(type='filepath', height=320, label='Input image')
                example_folder = os.path.join(os.path.dirname(__file__), "./files")
                example_fns = [os.path.join(example_folder, example) for example in os.listdir(example_folder)]
                gr.Examples(
                    examples=example_fns,
                    inputs=[input_image],
                    cache_examples=False,
                    label='Examples (click one of the images below to start)',
                    examples_per_page=30
                )

            with gr.Column(scale=1):
                with gr.Accordion('Advanced options', open=True):
                    with gr.Column():
                        domain = gr.Radio(
                            [
                                ("Outdoor", "outdoor"),
                                ("Indoor", "indoor"),
                                ("Object", "object"),
                            ],
                            label="Data type (select the one that matches your image)",
                            value="indoor",
                        )
                        denoising_steps = gr.Slider(
                            label="Number of denoising steps (more steps, better quality)",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=10,
                        )
                        ensemble_size = gr.Slider(
                            label="Ensemble size (larger ensemble, higher accuracy)",
                            minimum=1,
                            maximum=15,
                            step=1,
                            value=3,
                        )
                        seed = gr.Number(0, label='Random seed (use a negative value to leave the seed unspecified)')
                        processing_res = gr.Radio(
                            [
                                ("Native", 0),
                                ("Recommended", 768),
                            ],
                            label="Processing resolution",
                            value=768,
                        )

                run_btn = gr.Button('Generate', variant='primary', interactive=True)
        with gr.Row():
            with gr.Column():
                depth = gr.Image(interactive=False, show_label=False)
            with gr.Column():
                normal = gr.Image(interactive=False, show_label=False)
            with gr.Column():
                masked_image = gr.Image(interactive=False, label="Masked foreground")

        with gr.Row():
            files = gr.Files(
                label="Depth and Normal (numpy)",
                elem_id="download",
                interactive=False,
            )

        with gr.Row():
            recon_btn = gr.Button('(Beta) Is there a salient foreground object? If yes, click here to reconstruct its 3D model.', variant='primary', interactive=True)

        with gr.Row():
            reconstructed_3d = gr.Model3D(
                label='BiNI post-processed 3D model', interactive=False
            )

        with gr.Row():
            reconstructed_file = gr.Files(
                label="3D Mesh (plyfile)",
                elem_id="download",
                interactive=False
            )

        mask = gr.Image(interactive=False, label="Masked foreground", visible=False)
        run_btn.click(fn=depth_normal,
                      inputs=[input_image,
                              denoising_steps,
                              ensemble_size,
                              processing_res,
                              seed,
                              domain],
                      outputs=[depth, normal, files]
                      )
        recon_btn.click(fn=reconstruction,
                        inputs=[input_image, files],
                        outputs=[reconstructed_3d, reconstructed_file, masked_image]
                        )

    demo.queue().launch(share=True, max_threads=80)


if __name__ == '__main__':
    fire.Fire(run_demo)
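
# Running this file directly launches the demo; fire.Fire exposes run_demo as the CLI entry point.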